This is an R Markdown document. Markdown is a Simple formatting syntax for authoring HTML, PDF, and MS Word documents. Html_fragmented is quicker to load than html, hence for this pipeline which genenerated a large mount of data, we have used html_fragment to report Rmarkdown.
#1) The present analysis was done on MacOS, using knitToHtmlfragment.
To reuse this code for other datasets a) in this pipeline human gene symbols are used
SVA+LM or SVA+lmY or none and DEG_Limma normalization for study or experiment or batch correction and removal of gender
#save working directory location
wd<-getwd()
wd
## [1] "C:/Users/SMukherjee/Desktop/VM-share/extra/DEG_NoCuff_CellTypeBrainImmune(notStemCells_6-25-19)/CellType_macrophagesBYo_FC5UP"
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
## Loading required package: mgcv
## Warning: package 'mgcv' was built under R version 3.5.3
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 3.5.3
## This is mgcv 1.8-28. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(pamr)
## Warning: package 'pamr' was built under R version 3.5.3
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.5.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.5.3
library(ggplot2) # plot
## Warning: package 'ggplot2' was built under R version 3.5.3
library(ggmap)#plots
## Warning: package 'ggmap' was built under R version 3.5.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(gplots)#plots
## Warning: package 'gplots' was built under R version 3.5.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
## Warning: package 'Hmisc' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(viridis)#for corelation plot
## Warning: package 'viridis' was built under R version 3.5.3
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 3.5.3
library(corrplot)#for corelation plot
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(filesstrings)#for file organization
## Warning: package 'filesstrings' was built under R version 3.5.3
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 3.5.3
library(expss)#for handling NAs
## Warning: package 'expss' was built under R version 3.5.3
##
## Attaching package: 'expss'
## The following objects are masked from 'package:stringr':
##
## fixed, regex
## The following object is masked from 'package:ggplot2':
##
## vars
#Metadata
metadata=read.csv("Step5D_result_merge_GSE52564_to_SRP035309_metadata_SampleID.csv", header=T, sep=',')
#Expression data
Expr_GeneMs=read.csv("Step5D_result_merge_GSE52564_to_SRP035309_Expr_Gene_SampleID.csv", header =T, sep=',')
#Rename the gene symbol column from X to GeneMs
colnames(Expr_GeneMs)[colnames(Expr_GeneMs) == 'X'] <- 'GeneMs'
dim(metadata)
## [1] 30 9
dim(Expr_GeneMs)
## [1] 22123 31
#Need to make sure that the Gene IDs column labeled as GeneMs are unique to specify rownames with Gene IDs
Expr_GeneMs = Expr_GeneMs[!duplicated(Expr_GeneMs$GeneMs),]
dim(metadata)
## [1] 30 9
dim(Expr_GeneMs)
## [1] 22123 31
#We do not lose Gene IDs in RNA-seq (has no duplicates) unline microarray (has duplicates)
Expr_GeneMs_1=Expr_GeneMs[,-1] #get rid of extra column
rownames(Expr_GeneMs_1)=Expr_GeneMs$GeneMs
Expr_GeneMs_1=Expr_GeneMs_1[complete.cases(Expr_GeneMs_1), ]
edata=Expr_GeneMs_1
#Optional filter low counts or else svobj step compalins about missing values and all zero rows
#Expr_GeneMs_2=Expr_GeneMs_1[rowSums(Expr_GeneMs_1)>0,]
#edata=Expr_GeneMs_2
#From here identify column numbers of interest for next step
colnames(metadata)
## [1] "X" "Sample_Name" "MsGroup_Name"
## [4] "CellType" "Age" "Study"
## [7] "Species" "Tissue" "CellTypeDescription"
metadata_1=metadata[,c("CellType","Age","Study","Species","Tissue")]
#metadata_1=metadata[,4:8]
rownames(metadata_1)=metadata$Sample_Name
#View(metadata_1)
colnames(metadata_1)
## [1] "CellType" "Age" "Study" "Species" "Tissue"
#only keep the variables that will be considered in the mod, mod null and linear regression
#metadata_2=metadata_1[,1:5]
metadata_2=metadata_1[,c("CellType","Age","Study","Species","Tissue")]
rownames(metadata_2)=rownames(metadata_1)
#Optional way to drop metadata pheno trait column. Instead of dropping metadata pheno trait column in this pipeline we only provide traits of interest in the model.matrix steps
#metadata_2=metadata_2[!names(metadata_2) %in% c("Tissue")]
pheno=metadata_2
#View(metadata_2)
#View(pheno)
colnames(pheno)
## [1] "CellType" "Age" "Study" "Species" "Tissue"
DT::datatable(pheno)
DT::datatable(edata[1:7,1:7])
dim(metadata)
## [1] 30 9
dim(pheno)
## [1] 30 5
dim(Expr_GeneMs) #dropped off 1 extra column
## [1] 22123 31
dim(edata)
## [1] 22123 30
#Make sure we did not lose any samples in expression data
#Here in the output the 1st two and the last two should be same i.e. the first and last samples are same
colnames(Expr_GeneMs[2])
## [1] "astro_SRR1033783"
colnames(edata[1])
## [1] "astro_SRR1033783"
colnames(Expr_GeneMs[22])
## [1] "monocytes_SRR5483451"
colnames(edata[21])
## [1] "monocytes_SRR5483451"
t_edata<-t(edata)
DT::datatable(t_edata[,1:7])
DT::datatable(pheno)
dim(t_edata)
## [1] 30 22123
dim(pheno)
## [1] 30 5
#bind the pheno and edata, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata<-cbind(pheno, t_edata)
dim(pheno_t_edata)
## [1] 30 22128
DT::datatable(pheno_t_edata[,1:10])
write.csv(pheno_t_edata, "Step7_result_pheno_t_edata_All_Group_aggregate.csv")
#select the CellType pheno drop others
pheno_t_edata_CellType=pheno_t_edata[!names(pheno_t_edata) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_CellType)
## [1] 30 22124
DT::datatable(pheno_t_edata_CellType[,1:10])
#aggregate edata or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
##
## Attaching package: 'data.table'
## The following object is masked from 'package:expss':
##
## copy
setDT(pheno_t_edata_CellType)
pheno_t_edata_CellType_avg=pheno_t_edata_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_CellType_avg=t(pheno_t_edata_CellType_avg[,-1])
colnames(t_pheno_t_edata_CellType_avg)=pheno_t_edata_CellType_avg$CellType
dim(t_pheno_t_edata_CellType_avg)
## [1] 22123 2
DT::datatable(t_pheno_t_edata_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_CellType_avg, "Step7_result_edata_CellType_aggregate.csv")
#select the Species pheno drop others
pheno_t_edata_Species=pheno_t_edata[!names(pheno_t_edata) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_Species)
## [1] 30 22124
DT::datatable(pheno_t_edata_Species[,1:10])
#aggregate edata or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Species)
pheno_t_edata_Species_avg=pheno_t_edata_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Species_avg=t(pheno_t_edata_Species_avg[,-1])
colnames(t_pheno_t_edata_Species_avg)=pheno_t_edata_Species_avg$Species
dim(t_pheno_t_edata_Species_avg)
## [1] 22123 2
DT::datatable(t_pheno_t_edata_Species_avg[1:10,])
write.csv(t_pheno_t_edata_Species_avg, "Step7_result_edata_Species_aggregate.csv")
#select the Age pheno drop others
pheno_t_edata_Age=pheno_t_edata[!names(pheno_t_edata) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_Age)
## [1] 30 22124
DT::datatable(pheno_t_edata_Age[,1:10])
#aggregate edata or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Age)
pheno_t_edata_Age_avg=pheno_t_edata_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Age_avg=t(pheno_t_edata_Age_avg[,-1])
colnames(t_pheno_t_edata_Age_avg)=pheno_t_edata_Age_avg$Age
dim(t_pheno_t_edata_Age_avg)
## [1] 22123 2
DT::datatable(t_pheno_t_edata_Age_avg[1:10,])
write.csv(t_pheno_t_edata_Age_avg, "Step7_result_edata_Age_aggregate.csv")
#select the Tissue pheno drop others
pheno_t_edata_Tissue=pheno_t_edata[!names(pheno_t_edata) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_Tissue)
## [1] 30 22124
DT::datatable(pheno_t_edata_Tissue[,1:10])
#aggregate edata or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Tissue)
pheno_t_edata_Tissue_avg=pheno_t_edata_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Tissue_avg=t(pheno_t_edata_Tissue_avg[,-1])
colnames(t_pheno_t_edata_Tissue_avg)=pheno_t_edata_Tissue_avg$Tissue
dim(t_pheno_t_edata_Tissue_avg)
## [1] 22123 4
DT::datatable(t_pheno_t_edata_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_Tissue_avg, "Step7_result_edata_Tissue_aggregate.csv")
t_pheno_t_edata_all_avg<-cbind(t_pheno_t_edata_CellType_avg, t_pheno_t_edata_Species_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_all_avg))
write.csv(t_pheno_t_edata_all_avg, "Step7_result_edata_All_Group_aggregate.csv")
#save pheno_t_edata, t_pheno_t_edata_CellType_avg, t_pheno_t_edata_Species_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg, t_pheno_t_edata_all_avg
save(pheno_t_edata, t_pheno_t_edata_CellType_avg, t_pheno_t_edata_Species_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg, t_pheno_t_edata_all_avg, file="edata_aggregate_data.RData")
pdf(file="Step7_plot_boxplot_corr_plot_edata_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))
#########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_CellType_avg", col="yellow")
#box plot on aggregate Species
boxplot(t_pheno_t_edata_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Species_avg", col="yellow")
#box plot on aggregate Age
boxplot(t_pheno_t_edata_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Age_avg", col="yellow")
#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Tissue_avg", col="yellow")
#box plot on aggregate all
boxplot(t_pheno_t_edata_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_all_avg", col="yellow")
#########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_CellType_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Species_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Age_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Tissue_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_all_Group_corr_pval_aggregate.csv")
dev.off()
## png
## 2
pdf(file="Step7_plot_density_edata_aka_expr_aggregate.pdf",height=10,width=10)
#########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_CellType_avg", y = "Density")
#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Species",x="t_pheno_t_edata_Species_avg", y = "Density")
#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Age",x="t_pheno_t_edata_Age_avg", y = "Density")
#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_Tissue_avg", y = "Density")
#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for all",x="t_pheno_t_edata_all_avg", y = "Density")
dev.off()
## png
## 2
edata_pca_unscaled filtered to pca_unscaled genes
edata_pca_unscaled=edata
#Summary Statistics edata_pca_unscaled
DT::datatable(summary(edata_pca_unscaled[1:7,1:7]))
write.csv(summary(edata_pca_unscaled),"Step7_result_summary_stats_edata_pca_unscaled.csv")
pdf(file="Step7_plot_Visualization_boxplot_MDS_PC_pca_unscaled_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))
############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
############boxplot chunk###############
#Before boxplot edata_pca_unscaled
par(mai=c(1,0.8,1,0.8))
boxplot(edata_pca_unscaled, outline=FALSE, las=2, cex=0.25, main="edata_pca_unscaled", col="yellow")
############MDSplot chunk###############
#Before MDSplot edata_pca_unscaled
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_pca_unscaled, col=condition_colors, legend= "all", main="edata_pca_unscaled", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
#######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_pca_unscaled, colored by Study or batch, labels CellType
#pca0=prcomp(t(edata_pca_unscaled), center=T, scale =T)
pca0=prcomp(t(edata_pca_unscaled))
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -189.88574 -31.84178 -53.95528
## astro_SRR1033784 -193.28961 -34.46141 -60.07396
## endo_SRR1033795 -93.02506 47.94031 52.68268
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_pca_unscaled",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$CellType,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_pca_unscaled, colored by Study or batch, labels Species
#pca0=prcomp(t(edata_pca_unscaled), center=T, scale=T)
pca0=prcomp(t(edata_pca_unscaled))
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -189.88574 -31.84178 -53.95528
## astro_SRR1033784 -193.28961 -34.46141 -60.07396
## endo_SRR1033795 -93.02506 47.94031 52.68268
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_pca_unscaled",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Species,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_pca_unscaled, colored by Study or batch, labels Age
#pca0=prcomp(t(edata_pca_unscaled), center=T, scale=T)
pca0=prcomp(t(edata_pca_unscaled))
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -189.88574 -31.84178 -53.95528
## astro_SRR1033784 -193.28961 -34.46141 -60.07396
## endo_SRR1033795 -93.02506 47.94031 52.68268
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_pca_unscaled",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Age,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
dev.off()
## png
## 2
save.image(file="done_pre_Step8.RData")
See DEG pipeline, uses edata and modified metadata
#Load pheno, edata and edata_combatY required for steps below
load(file="done_pre_Step8.RData")
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4131120 220.7 8074265 431.3 8074265 431.3
## Vcells 17020970 129.9 30716681 234.4 21625254 165.0
#load required libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggrepel) #Avoid overlapping labels
## Warning: package 'ggrepel' was built under R version 3.5.3
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:expss':
##
## between, compute, first, last, na_if, recode, vars
## The following object is masked from 'package:filesstrings':
##
## all_equal
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#From here identify column numbers of interest for next step
colnames(pheno)
## [1] "CellType" "Age" "Study" "Species" "Tissue"
#Note here we use the generic DEG_Limma differential expression calculation step
#mod only includes the variables of interest and covariants, not surrogate variables
#https://www.rdocumentation.org/packages/limma/versions/3.28.14/topics/lmFit
#fit_DEG_Limma=lmFit(edata,mod)
mod_DEG_Limma = model.matrix(~0+CellType+Study, data=pheno)
fit_DEG_Limma=lmFit(edata,mod_DEG_Limma)
summary(fit_DEG_Limma)
## Length Class Mode
## coefficients 88492 -none- numeric
## rank 1 -none- numeric
## assign 4 -none- numeric
## qr 5 qr list
## df.residual 22123 -none- numeric
## sigma 22123 -none- numeric
## cov.coefficients 16 -none- numeric
## stdev.unscaled 88492 -none- numeric
## pivot 4 -none- numeric
## Amean 22123 -none- numeric
## method 1 -none- character
## design 120 -none- numeric
DT::datatable(fit_DEG_Limma$coefficients)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#Coefficients if not estimable i.e. gives NAs in coefficient then remove. Try to use the max number of variables
#mod_DEG_Limma = model.matrix(~0+CellType+Study, data=pheno)
#mod_DEG_Limma = model.matrix(~0+CellType+Study+Species+Tissue+Age, data=pheno)
#mod_DEG_Limma = model.matrix(~0+CellType+Species+Age, data=pheno)
#Use the values in the creating_CellType_contrasts as the first three values of contrast matrix for CellType
design_DEG_Limma<-model.matrix(~0+CellType, data=pheno)
colnames(design_DEG_Limma)
## [1] "CellTypemacrophages" "CellTypeother"
design_DEG_Limma[1:3,]
## CellTypemacrophages CellTypeother
## astro_SRR1033783 0 1
## astro_SRR1033784 0 1
## endo_SRR1033795 0 1
DT::datatable(design_DEG_Limma)
#comparisons you want write here
creating_CellType_contrasts_DEG_Limma<-makeContrasts(macrophagesBYo=CellTypemacrophages-CellTypeother, levels =design_DEG_Limma)
creating_CellType_contrasts_DEG_Limma
## Contrasts
## Levels macrophagesBYo
## CellTypemacrophages 1
## CellTypeother -1
#Create contrasts to find the differentially expressed genes .
#We hold other variables and surrogate variables at 0 (NA since we are using combat)
#Here the net number of rows is same and correspond to columns in fit_DEG_Limma$coefficients
contrast.matrix_DEG_Limma <- cbind("macrophagesBYo"=c(1,-1,rep(0,2)))
contrast.matrix_DEG_Limma
## macrophagesBYo
## [1,] 1
## [2,] -1
## [3,] 0
## [4,] 0
#Identification of differentially expressed genes for the contrasts
fitContrasts_DEG_Limma=contrasts.fit(fit_DEG_Limma, contrast.matrix_DEG_Limma)
#calculate empirical Bayes variabce
eb_DEG_Limma=eBayes(fitContrasts_DEG_Limma)
#top table and results table for differentially expressed genes for comparison of all levels. This is different from the pairwise contrast comparison
result_tT_full_DEG_Limma<- topTableF(eb_DEG_Limma, adjust="BH", n=nrow(edata))
result_tT_FC5.00_DEG_Limma <- result_tT_full_DEG_Limma[which(result_tT_full_DEG_Limma$adj.P.Val <= 0.05), ]
dim(result_tT_full_DEG_Limma)
## [1] 22123 5
dim(result_tT_FC5.00_DEG_Limma)
## [1] 1132 5
write.csv(result_tT_full_DEG_Limma, "Step12_result_tT_full_DEG_Limma.csv")
write.csv(result_tT_FC5.00_DEG_Limma, "Step12_result_tT_FC5.00_DEG_Limma.csv")
#DEG_Limma for contrast "macrophagesBYo"=CellTypemacrophages-CellTypeother
result_macrophagesBYo_full_DEG_Limma<- topTable(eb_DEG_Limma, number = nrow(edata), adjust = "BH", coef="macrophagesBYo")
result_macrophagesBYo_full_DEG_Limma$Contrast<-rep("macrophagesBYo",nrow(result_macrophagesBYo_full_DEG_Limma))
result_macrophagesBYo_full_DEG_Limma$FC<- 2^(result_macrophagesBYo_full_DEG_Limma$logFC)
result_macrophagesBYo_FC5.00_DEG_Limma<- result_macrophagesBYo_full_DEG_Limma[which(result_macrophagesBYo_full_DEG_Limma$FC >= 5.00), ]
dim(result_macrophagesBYo_full_DEG_Limma)
## [1] 22123 8
dim(result_macrophagesBYo_FC5.00_DEG_Limma)
## [1] 545 8
write.csv(result_macrophagesBYo_full_DEG_Limma, "Step12_result_macrophagesBYo_full_DEG_Limma.csv")
write.csv(result_macrophagesBYo_FC5.00_DEG_Limma, "Step12_result_macrophagesBYo_FC5.00_DEG_Limma.csv")
summary(result_macrophagesBYo_full_DEG_Limma)
## logFC AveExpr t
## Min. :-11.87828 Min. : 0.06945 Min. :-54.99024
## 1st Qu.: 0.00000 1st Qu.: 0.44096 1st Qu.: 0.00000
## Median : 0.02334 Median : 1.36609 Median : 0.04067
## Mean : 0.24453 Mean : 1.95727 Mean : 0.49308
## 3rd Qu.: 0.49869 3rd Qu.: 3.00784 3rd Qu.: 0.87651
## Max. : 13.44761 Max. :13.10800 Max. : 49.70727
## P.Value adj.P.Val B Contrast
## Min. :0.0000 Min. :0.000 Min. :-6.748 Length:22123
## 1st Qu.:0.2567 1st Qu.:1.000 1st Qu.:-6.748 Class :character
## Median :0.7587 Median :1.000 Median :-6.699 Mode :character
## Mean :0.6244 Mean :0.853 Mean :-5.814
## 3rd Qu.:0.9957 3rd Qu.:1.000 3rd Qu.:-6.081
## Max. :1.0000 Max. :1.000 Max. :49.792
## FC
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 1.016
## Mean : 2.877
## 3rd Qu.: 1.413
## Max. :11172.090
summary(result_macrophagesBYo_FC5.00_DEG_Limma)
## logFC AveExpr t P.Value
## Min. : 2.336 Min. : 0.5893 Min. : 1.364 Min. :0.000e+00
## 1st Qu.: 2.708 1st Qu.: 2.0723 1st Qu.: 2.956 1st Qu.:6.820e-06
## Median : 3.193 Median : 3.0237 Median : 4.028 Median :3.723e-04
## Mean : 3.736 Mean : 3.3980 Mean : 5.211 Mean :9.569e-03
## 3rd Qu.: 4.159 3rd Qu.: 4.3762 3rd Qu.: 5.476 3rd Qu.:6.147e-03
## Max. :13.448 Max. :11.5875 Max. :49.707 Max. :1.832e-01
## adj.P.Val B Contrast
## Min. :0.0000000 Min. :-5.829 Length:545
## 1st Qu.:0.0005627 1st Qu.:-2.860 Class :character
## Median :0.0127163 Median :-0.199 Mode :character
## Mean :0.0817942 Mean : 2.182
## 3rd Qu.:0.0937221 3rd Qu.: 3.688
## Max. :0.8563821 Max. :48.495
## FC
## Min. : 5.049
## 1st Qu.: 6.532
## Median : 9.146
## Mean : 66.890
## 3rd Qu.: 17.864
## Max. :11172.090
#Bind all differentally expressed full genes into a single table
result_allContrasts_full_DEG_Limma<-rbind(result_macrophagesBYo_full_DEG_Limma) #here only one table so rbind not required
dim(result_allContrasts_full_DEG_Limma)
## [1] 22123 8
write.csv(result_allContrasts_full_DEG_Limma, "Step12_result_allContrasts_full_DEG_Limma.csv")
#Bind all differentally expressed pval genes into a single table
result_allContrasts_FC5.00_DEG_Limma<-rbind(result_macrophagesBYo_FC5.00_DEG_Limma) #here only one table so rbind not required
dim(result_allContrasts_FC5.00_DEG_Limma)
## [1] 545 8
write.csv(result_allContrasts_FC5.00_DEG_Limma, "Step12_result_allContrasts_FC5.00_DEG_Limma.csv")
# Q-Q plot of moderated t-statistics
#http://web.mit.edu/~r/current/arch/i386_linux26/lib/R/library/limma/html/lmFit.html
pdf(file="Step13_plot_qqplot_eb_DEG_Limma.pdf",height=10,width=10)
par(mfrow=c(2,2))
qqt(eb_DEG_Limma$t[,"macrophagesBYo"],df_DEG_Limma=eb_DEG_Limma$df.residual+eb_DEG_Limma$df.prior)
## Warning in plot.window(...): "df_DEG_Limma" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "df_DEG_Limma" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "df_DEG_Limma"
## is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "df_DEG_Limma"
## is not a graphical parameter
## Warning in box(...): "df_DEG_Limma" is not a graphical parameter
## Warning in title(...): "df_DEG_Limma" is not a graphical parameter
abline(0,1)
dev.off()
## png
## 2
#https://www.biostars.org/p/203876/
pdf(file="Step13_plot_Volcano_result_adjFC5.00_DEG_Limma.pdf",height=10,width=10)
#par(mfrow=c(2,2))
df<-result_macrophagesBYo_full_DEG_Limma
df$gene<-rownames(result_macrophagesBYo_full_DEG_Limma) #create column for genes
mutateddf <- mutate(df, sig=ifelse(df$adj.P.Val<0.05, "adj.P.Val<0.05", "Not Sig"))
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
geom_point(aes(col=sig)) + #add points colored by significance
scale_color_manual(values=c("red", "black")) +
ggtitle("Volcanoplot DEG_Limma")+
geom_text_repel(data=head(mutateddf, 10), aes(label=gene))
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg")
volc
dev.off()
## png
## 2
#https://www.biostars.org/p/203876/
pdf(file="Step13_plot_Volcano_result_adjPValFC_DEG_Limma.pdf",height=10,width=10)
#par(mfrow=c(2,2))
df<-result_macrophagesBYo_full_DEG_Limma
df$gene<-rownames(result_macrophagesBYo_full_DEG_Limma) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
geom_point(aes(col=sig)) + #add points colored by significance
scale_color_manual(values=c("black", "red")) +
ggtitle("Volcanoplot DEG_Limma")+
geom_text_repel(data=head(mutateddf, 10), aes(label=gene))
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg")
volc
dev.off()
## png
## 2
df<-result_macrophagesBYo_full_DEG_Limma
df$gene<-rownames(result_macrophagesBYo_full_DEG_Limma) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#show UP100 or in bar plot
mutateddf_UP100 <-mutateddf[order(-mutateddf$logFC), ] #descenting order sort
mutateddf_UP100 <- mutateddf_UP100[1:100,] #top100 upregulated genes
#ggplot colors available http://sape.inf.usi.ch/quick-reference/ggplot2/colour
pdf(file="Step13_plot_barplot_result_adjPValUP100_DEG_Limma.pdf",height=5,width=10)
#Will have different colors depending on significance
bar1<-ggplot(mutateddf_UP100, aes(x=gene, y=logFC, width=.7))+
geom_bar(position="stack", fill="red4", stat="identity")+
labs(title=paste0("Barplot top genes with UP100 with significant adj.P.Val"))+
theme(axis.text.x=element_text(angle=90,hjust=1))
#labs(x="genes")+
#labs(y="fold change")+
#labs(caption="DEG_Limma")+
#scale_fill_gradient(low="black",high="red")+
#theme(plot.margin = margin(2,2,2,2, "cm"), plot.background = element_rect(fill = "white"))
print(bar1)
dev.off()
## png
## 2
edata_keepDEG_Limma_UP100<- edata[which(rownames(edata) %in% mutateddf_UP100$gene), ]
dim(mutateddf_UP100)
## [1] 100 10
dim(edata_keepDEG_Limma_UP100)
## [1] 100 30
pdf(file="Step13_plot_heatmap_result_adjPValUP100_DEG_Limma.pdf",height=15,width=15)
#mapdata<-edata[rownames(edata)==mutateddf$gene,]
mapdata<-edata_keepDEG_Limma_UP100
mapdata_matrix <- data.matrix(mapdata)
colfunc<-colorRampPalette(c("red","green")) #because we are blue group!
hr<-hclust(as.dist(1-cor(t(mapdata_matrix),method="pearson")),method="complete")
plotheatmap<-heatmap.2(mapdata_matrix,Rowv=as.dendrogram(hr),scale="row",density.info="none",trace="none",col=colfunc(256),cexRow=0.4,cexCol=0.4)
dev.off()
## png
## 2
#genes which are sig differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Limma)
## [1] 545 8
DEG_Limmanames=rownames(result_allContrasts_FC5.00_DEG_Limma)
head(DEG_Limmanames)
## [1] "Cd5l" "Saa3" "Gata6" "Cfb" "Vsig4" "Gbp2b"
length(DEG_Limmanames)
## [1] 545
#save result_allContrasts_full_DEG_Limma, result_allContrasts_FC5.00_DEG_Limma, DEG_Limmanames
save(result_allContrasts_full_DEG_Limma, result_allContrasts_FC5.00_DEG_Limma, DEG_Limmanames, file="DEG_Limma_data.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123 30
edata_keepDEG_Limma<- edata[which(rownames(edata) %in% DEG_Limmanames), ]
dim(edata_keepDEG_Limma)
## [1] 545 30
#unique and common genes between the different contrasts DEG_Limma are retained
DT::datatable(edata_keepDEG_Limma[1:7,1:7])
#Export edata keepDEG_Limma genes only
write.csv(edata_keepDEG_Limma, "Step14_result_edata_keepDEG_Limma.csv")
edata_keepDEG_Limma filtered to keepDEG_Limma genes
#Summary Statistics edata_keepDEG_Limma
DT::datatable(summary(edata_keepDEG_Limma[1:7,1:7]))
write.csv(summary(edata_keepDEG_Limma),"Step14_result_summary_stats_edata_keepDEG_Limma.csv")
pdf(file="Step14_plot_Visualization_boxplot_MDS_PC_keepDEG_Limma_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))
############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
############boxplot chunk###############
#Before boxplot edata_keepDEG_Limma
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_Limma, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_Limma", col="yellow")
############MDSplot chunk###############
#Before MDSplot edata_keepDEG_Limma
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_Limma, col=condition_colors, legend= "all", main="edata_keepDEG_Limma", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
#######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_Limma, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_Limma), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -7.434513 -11.030519 9.07007792
## astro_SRR1033784 -6.086374 -12.015319 10.18763583
## endo_SRR1033795 -9.439329 -2.863454 0.00307252
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_Limma",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$CellType,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_Limma, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_Limma), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -7.434513 -11.030519 9.07007792
## astro_SRR1033784 -6.086374 -12.015319 10.18763583
## endo_SRR1033795 -9.439329 -2.863454 0.00307252
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_Limma",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Species,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_Limma, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_Limma), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -7.434513 -11.030519 9.07007792
## astro_SRR1033784 -6.086374 -12.015319 10.18763583
## endo_SRR1033795 -9.439329 -2.863454 0.00307252
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_Limma",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Age,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
dev.off()
## png
## 2
t_edata_keepDEG_Limma<-t(edata_keepDEG_Limma)
DT::datatable(t_edata_keepDEG_Limma[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_Limma)
## [1] 30 545
dim(pheno)
## [1] 30 5
#bind the pheno and edata_keepDEG_Limma, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_Limma<-cbind(pheno, t_edata_keepDEG_Limma)
dim(pheno_t_edata_keepDEG_Limma)
## [1] 30 550
DT::datatable(pheno_t_edata_keepDEG_Limma[,1:10])
write.csv(pheno_t_edata_keepDEG_Limma, "Step14_result_pheno_t_edata_keepDEG_Limma_All_Group_aggregate.csv")
#select the CellType pheno drop others
pheno_t_edata_keepDEG_Limma_CellType=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Limma_CellType)
## [1] 30 546
DT::datatable(pheno_t_edata_keepDEG_Limma_CellType[,1:10])
#aggregate edata_keepDEG_Limma or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_CellType)
pheno_t_edata_keepDEG_Limma_CellType_avg=pheno_t_edata_keepDEG_Limma_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_Limma_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_CellType_avg=t(pheno_t_edata_keepDEG_Limma_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_CellType_avg)=pheno_t_edata_keepDEG_Limma_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_Limma_CellType_avg)
## [1] 545 2
DT::datatable(t_pheno_t_edata_keepDEG_Limma_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_CellType_avg, "Step14_result_edata_keepDEG_Limma_CellType_aggregate.csv")
#select the Species pheno drop others
pheno_t_edata_keepDEG_Limma_Species=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_Limma_Species)
## [1] 30 546
DT::datatable(pheno_t_edata_keepDEG_Limma_Species[,1:10])
#aggregate edata_keepDEG_Limma or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_Species)
pheno_t_edata_keepDEG_Limma_Species_avg=pheno_t_edata_keepDEG_Limma_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_Limma_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_Species_avg=t(pheno_t_edata_keepDEG_Limma_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_Species_avg)=pheno_t_edata_keepDEG_Limma_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_Limma_Species_avg)
## [1] 545 2
DT::datatable(t_pheno_t_edata_keepDEG_Limma_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_Species_avg, "Step14_result_edata_keepDEG_Limma_Species_aggregate.csv")
#select the Age pheno drop others
pheno_t_edata_keepDEG_Limma_Age=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Limma_Age)
## [1] 30 546
DT::datatable(pheno_t_edata_keepDEG_Limma_Age[,1:10])
#aggregate edata_keepDEG_Limma or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_Age)
pheno_t_edata_keepDEG_Limma_Age_avg=pheno_t_edata_keepDEG_Limma_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_Limma_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_Age_avg=t(pheno_t_edata_keepDEG_Limma_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_Age_avg)=pheno_t_edata_keepDEG_Limma_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_Limma_Age_avg)
## [1] 545 2
DT::datatable(t_pheno_t_edata_keepDEG_Limma_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_Age_avg, "Step14_result_edata_keepDEG_Limma_Age_aggregate.csv")
#select the Tissue pheno drop others
pheno_t_edata_keepDEG_Limma_Tissue=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_Limma_Tissue)
## [1] 30 546
DT::datatable(pheno_t_edata_keepDEG_Limma_Tissue[,1:10])
#aggregate edata_keepDEG_Limma or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_Tissue)
pheno_t_edata_keepDEG_Limma_Tissue_avg=pheno_t_edata_keepDEG_Limma_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_Limma_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_Tissue_avg=t(pheno_t_edata_keepDEG_Limma_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_Tissue_avg)=pheno_t_edata_keepDEG_Limma_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_Limma_Tissue_avg)
## [1] 545 4
DT::datatable(t_pheno_t_edata_keepDEG_Limma_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_Tissue_avg, "Step14_result_edata_keepDEG_Limma_Tissue_aggregate.csv")
t_pheno_t_edata_keepDEG_Limma_all_avg<-cbind(t_pheno_t_edata_keepDEG_Limma_CellType_avg, t_pheno_t_edata_keepDEG_Limma_Species_avg, t_pheno_t_edata_keepDEG_Limma_Age_avg, t_pheno_t_edata_keepDEG_Limma_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_Limma_all_avg))
write.csv(t_pheno_t_edata_keepDEG_Limma_all_avg, "Step14_result_edata_keepDEG_Limma_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_Limma, t_pheno_t_edata_keepDEG_Limma_CellType_avg, t_pheno_t_edata_keepDEG_Limma_Species_avg, t_pheno_t_edata_keepDEG_Limma_Age_avg, t_pheno_t_edata_keepDEG_Limma_Tissue_avg, t_pheno_t_edata_keepDEG_Limma_all_avg
save(pheno_t_edata_keepDEG_Limma, t_pheno_t_edata_keepDEG_Limma_CellType_avg, t_pheno_t_edata_keepDEG_Limma_Species_avg, t_pheno_t_edata_keepDEG_Limma_Age_avg, t_pheno_t_edata_keepDEG_Limma_Tissue_avg, t_pheno_t_edata_keepDEG_Limma_all_avg, file="edata_keepDEG_Limma_aggregate_data.RData")
pdf(file="Step14_plot_boxplot_corr_plot_edata_keepDEG_Limma_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))
#########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_Limma_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_CellType_avg", col="yellow")
#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_Limma_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_Species_avg", col="yellow")
#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_Limma_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_Age_avg", col="yellow")
#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_Limma_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_Tissue_avg", col="yellow")
#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_Limma_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_all_avg", col="yellow")
#########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_CellType_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_Species_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_Age_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_Tissue_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_all_Group_corr_pval_aggregate.csv")
dev.off()
## png
## 2
pdf(file="Step14_plot_density_edata_keepDEG_Limma_aka_expr_aggregate.pdf",height=10,width=10)
#########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_Limma_CellType_avg", y = "Density")
#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_Limma_Species_avg", y = "Density")
#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_Limma_Age_avg", y = "Density")
#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_Limma_Tissue_avg", y = "Density")
#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_Limma_all_avg", y = "Density")
dev.off()
## png
## 2
#save pheno, edata
save(pheno, metadata_1, edata, edata_keepDEG_Limma, file="edata_keepDEG_Limma_data.RData")
#save image
save.image(file="DEG_Limma_temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3991873 213.2 8074265 431.3 8074265 431.3
## Vcells 7471722 57.1 24573344 187.5 30716424 234.4
#Load pheno, edata required for steps below
load(file="done_pre_Step8.RData")
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4191795 223.9 8074265 431.3 8074265 431.3
## Vcells 16865709 128.7 29568012 225.6 30716424 234.4
#load required libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggmap)#plots
library(gplots)#plots
library(ggrepel) #Avoid overlapping labels
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
library(dplyr)
colnames(edata)==rownames(pheno)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE
#This should be all TRUE
#DT::datatable(edata[1:3,1:3])
edata[1:3,1:3]
## astro_SRR1033783 astro_SRR1033784 endo_SRR1033795
## 0610009B22Rik 4.955948 4.845806 4.4090832
## 0610009E02Rik 1.459870 1.113146 0.3603799
## 0610009L18Rik 4.215835 3.674934 1.6072611
#From here identify column numbers of interest for next step
head(pheno)
## CellType Age Study Species
## astro_SRR1033783 other No_data GSE52564 mouse_FVN_Swiss
## astro_SRR1033784 other No_data GSE52564 mouse_FVN_Swiss
## endo_SRR1033795 other No_data GSE52564 mouse_FVN_Swiss
## endo_SRR1033796 other No_data GSE52564 mouse_FVN_Swiss
## macrophages_SRR1106141 macrophages 5_months SRP035309 mouse_C57BL6
## macrophages_SRR1106142 macrophages 5_months SRP035309 mouse_C57BL6
## Tissue
## astro_SRR1033783 cerebral_cortex
## astro_SRR1033784 cerebral_cortex
## endo_SRR1033795 cerebral_cortex
## endo_SRR1033796 cerebral_cortex
## macrophages_SRR1106141 peritoneum
## macrophages_SRR1106142 peritoneum
#Create index to subset expression data to find the differentially expressed genes .
index_O=which(pheno$CellType=='other')
index_AS=which(pheno$CellType=='macrophages')
#subset expression data to find the differentially expressed genes .
edata_macrophagesBYo<-edata[,c(index_AS,index_O)]
head(edata_macrophagesBYo)
## macrophages_SRR1106141 macrophages_SRR1106142
## 0610009B22Rik 5.602279 5.4082505
## 0610009E02Rik 0.000000 0.3667327
## 0610009L18Rik 2.906423 2.5190074
## 0610009O20Rik 3.385650 3.8842368
## 0610010F05Rik 1.206407 1.2885642
## 0610012G03Rik 9.115554 8.9263649
## macrophages_SRR1106143 macrophages_SRR1106169
## 0610009B22Rik 5.4189456 5.902053
## 0610009E02Rik 0.0000000 0.000000
## 0610009L18Rik 3.2384505 3.868483
## 0610009O20Rik 2.8918854 3.441345
## 0610010F05Rik 0.7934928 1.189148
## 0610012G03Rik 9.2219219 9.003139
## macrophages_SRR1106186 macrophages_SRR1106188
## 0610009B22Rik 5.4959670 5.6169046
## 0610009E02Rik 0.3710340 0.0000000
## 0610009L18Rik 3.1685405 2.7843654
## 0610009O20Rik 3.3645188 3.5556140
## 0610010F05Rik 0.8316789 0.9867346
## 0610012G03Rik 8.8119698 9.0534257
## astro_SRR1033783 astro_SRR1033784 endo_SRR1033795
## 0610009B22Rik 4.955948 4.845806 4.4090832
## 0610009E02Rik 1.459870 1.113146 0.3603799
## 0610009L18Rik 4.215835 3.674934 1.6072611
## 0610009O20Rik 6.310704 6.183138 4.8231150
## 0610010F05Rik 2.772554 2.804534 1.8905326
## 0610012G03Rik 6.950474 6.846606 6.9670158
## endo_SRR1033796 microglia_SRR1033793 microglia_SRR1033794
## 0610009B22Rik 4.502667 2.0202852 2.6798393
## 0610009E02Rik 0.243692 0.5107171 0.2378286
## 0610009L18Rik 1.952993 3.7203000 3.0468366
## 0610009O20Rik 4.848143 5.2641770 5.5836801
## 0610010F05Rik 1.971668 0.3812806 0.3039537
## 0610012G03Rik 7.052615 7.3788153 7.3411645
## microglia_SRR1106118 microglia_SRR1106122
## 0610009B22Rik 5.6552688 5.8137871
## 0610009E02Rik 0.1054247 0.0000000
## 0610009L18Rik 2.5747633 3.5681973
## 0610009O20Rik 3.3889920 3.5025661
## 0610010F05Rik 0.2031030 0.3484909
## 0610012G03Rik 7.7680973 8.0522739
## microglia_SRR5483445 microglia_SRR5483446
## 0610009B22Rik 6.581153 6.7364702
## 0610009E02Rik 1.046593 0.6805091
## 0610009L18Rik 4.507416 4.7608643
## 0610009O20Rik 3.645392 3.8413100
## 0610010F05Rik 2.591149 2.4517537
## 0610012G03Rik 7.182036 7.2595420
## microglia_SRR5483447 monocytes_SRR5483448
## 0610009B22Rik 6.6051291 5.0429266
## 0610009E02Rik 0.7372848 0.8453395
## 0610009L18Rik 4.8392993 3.9059140
## 0610009O20Rik 3.7905855 2.1779053
## 0610010F05Rik 2.6293385 2.7444755
## 0610012G03Rik 7.2184996 7.2559842
## monocytes_SRR5483449 monocytes_SRR5483450
## 0610009B22Rik 4.6151396 4.972788
## 0610009E02Rik 0.8159736 1.407900
## 0610009L18Rik 3.9546957 3.179912
## 0610009O20Rik 2.2899918 2.278137
## 0610010F05Rik 2.6264154 2.507282
## 0610012G03Rik 7.2598248 7.524382
## monocytes_SRR5483451 monocytes_SRR5483452
## 0610009B22Rik 4.2654510 3.974523
## 0610009E02Rik 0.5081903 1.011250
## 0610009L18Rik 2.9825106 2.272594
## 0610009O20Rik 2.1253128 2.210888
## 0610010F05Rik 2.8541869 3.010300
## 0610012G03Rik 6.7149669 6.625560
## myeoligo_SRR1033791 myeoligo_SRR1033792 neuron_SRR1033785
## 0610009B22Rik 5.13165355 4.9628127 5.074496
## 0610009E02Rik 0.03926298 0.2194104 1.179335
## 0610009L18Rik 2.59376022 3.4391757 3.614653
## 0610009O20Rik 4.77915518 4.9189658 5.480809
## 0610010F05Rik 1.62141301 1.7297040 3.689563
## 0610012G03Rik 7.41307069 7.4875229 7.607235
## neuron_SRR1033786 newoligo_SRR1033789 newoligo_SRR1033790
## 0610009B22Rik 5.031467 5.4321841 5.1954989
## 0610009E02Rik 1.716639 0.5480982 0.8470736
## 0610009L18Rik 3.510926 3.8301179 4.3732173
## 0610009O20Rik 5.352959 5.4502942 5.5312406
## 0610010F05Rik 4.010739 3.0207377 1.8599674
## 0610012G03Rik 7.540919 7.3616935 7.8424973
## preoligo_SRR1033787 preoligo_SRR1033788
## 0610009B22Rik 5.1328847 4.9719015
## 0610009E02Rik 0.6893239 0.4045672
## 0610009L18Rik 3.8282169 4.5291014
## 0610009O20Rik 5.8388370 6.0930914
## 0610010F05Rik 2.8053688 1.6266376
## 0610012G03Rik 7.1233211 8.0684096
#Differentially expressed genes between "macrophagesBYo"
t_macrophagesBYo<-apply(edata_macrophagesBYo,1,function(x){
mod<-t.test(x[index_O],x[index_AS])
meanx<-mean(2^x[index_O]-1) #because we had log2+1
meany<-mean(2^x[index_AS]-1)#because we had log2+1
tvalue<-mod[1]$statistic[[1]]
FC<-meany/meanx
logFC<-log2(FC)
pvalue<-mod[3]$p.value
Contrast<-"macrophagesBYo"
c(meanx=meanx,meany=meany,tvalue=tvalue,FC=FC,logFC=logFC, pvalue=pvalue, Contrast=Contrast)
})
t_macrophagesBYo<-t(t_macrophagesBYo)
adj.P.Val<-p.adjust(t_macrophagesBYo[,'pvalue'], method="BH")
#DEG_Simple for contrast "macrophagesBYo"
result_macrophagesBYo_full_DEG_Simple<-cbind(t_macrophagesBYo,adj.P.Val)
result_macrophagesBYo_full_DEG_Simple<-as.data.frame(result_macrophagesBYo_full_DEG_Simple)
cols = c("meanx","meany", "tvalue", "FC", "logFC", "pvalue", "adj.P.Val")
result_macrophagesBYo_full_DEG_Simple[,cols] = apply(result_macrophagesBYo_full_DEG_Simple[,cols], 2, function(x) as.numeric(as.character(x)))
result_macrophagesBYo_full_DEG_Simple = result_macrophagesBYo_full_DEG_Simple[order(-result_macrophagesBYo_full_DEG_Simple$logFC), ] #order by descending fold change
result_macrophagesBYo_FC5.00_DEG_Simple<- result_macrophagesBYo_full_DEG_Simple[which(result_macrophagesBYo_full_DEG_Simple$FC >= 5.00), ]
dim(result_macrophagesBYo_full_DEG_Simple)
## [1] 22123 8
dim(result_macrophagesBYo_FC5.00_DEG_Simple)
## [1] 1036 8
write.csv(result_macrophagesBYo_full_DEG_Simple, "Step15_result_macrophagesBYo_full_DEG_Simple.csv")
write.csv(result_macrophagesBYo_FC5.00_DEG_Simple, "Step15_result_macrophagesBYo_FC5.00_DEG_Simple.csv")
summary(result_macrophagesBYo_full_DEG_Simple)
## meanx meany tvalue FC
## Min. : 0.00 Min. : 0.00 Min. :-5.9272 Min. :0.0000
## 1st Qu.: 0.62 1st Qu.: 0.21 1st Qu.:-0.4983 1st Qu.:0.1878
## Median : 3.24 Median : 2.59 Median : 0.3996 Median :0.7514
## Mean : 45.14 Mean : 45.18 Mean : 0.5792 Mean : Inf
## 3rd Qu.: 14.65 3rd Qu.: 13.51 3rd Qu.: 1.6470 3rd Qu.:1.3110
## Max. :32000.54 Max. :32399.63 Max. : 7.9696 Max. : Inf
## logFC pvalue Contrast
## Min. : -Inf Min. :0.00000 macrophagesBYo:22123
## 1st Qu.:-2.4126 1st Qu.:0.09135
## Median :-0.4123 Median :0.33644
## Mean : NaN Mean :0.38997
## 3rd Qu.: 0.3906 3rd Qu.:0.66678
## Max. : Inf Max. :0.99999
## adj.P.Val
## Min. :0.0002494
## 1st Qu.:0.3653350
## Median :0.6727763
## Mean :0.6107720
## 3rd Qu.:0.8888960
## Max. :0.9999950
summary(result_macrophagesBYo_FC5.00_DEG_Simple)
## meanx meany tvalue FC
## Min. : 0.0000 Min. : 0.253 Min. :-5.8009 Min. : 5.002
## 1st Qu.: 0.1686 1st Qu.: 2.080 1st Qu.:-1.8087 1st Qu.: 6.590
## Median : 0.6082 Median : 8.133 Median :-1.4198 Median :10.254
## Mean : 5.3510 Mean : 69.004 Mean :-1.5236 Mean : Inf
## 3rd Qu.: 2.1274 3rd Qu.: 30.080 3rd Qu.:-1.1195 3rd Qu.:21.132
## Max. :381.8397 Max. :6932.950 Max. :-0.1661 Max. : Inf
## logFC pvalue Contrast
## Min. :2.323 Min. :0.0000736 macrophagesBYo:1036
## 1st Qu.:2.720 1st Qu.:0.1277629
## Median :3.358 Median :0.2129424
## Mean : Inf Mean :0.2321935
## 3rd Qu.:4.401 3rd Qu.:0.3112067
## Max. : Inf Max. :0.8736574
## adj.P.Val
## Min. :0.0185
## 1st Qu.:0.4334
## Median :0.5312
## Mean :0.5316
## 3rd Qu.:0.6483
## Max. :0.9632
#Bind all differentally expressed full genes into a single table
result_allContrasts_full_DEG_Simple<-rbind(result_macrophagesBYo_full_DEG_Simple) #here only one table so rbind not required
dim(result_allContrasts_full_DEG_Simple)
## [1] 22123 8
write.csv(result_allContrasts_full_DEG_Simple, "Step15_result_allContrasts_full_DEG_Simple.csv")
#Bind all differentally expressed pval genes into a single table
result_allContrasts_FC5.00_DEG_Simple<-rbind(result_macrophagesBYo_FC5.00_DEG_Simple)#here only one table so rbind not required
dim(result_allContrasts_FC5.00_DEG_Simple)
## [1] 1036 8
write.csv(result_allContrasts_FC5.00_DEG_Simple, "Step15_result_allContrasts_FC5.00_DEG_Simple.csv")
#https://www.biostars.org/p/203876/
pdf(file="Step15_plot_Volcano_result_adjFC5.00_DEG_Simple.pdf",height=10,width=10)
#par(mfrow=c(2,2))
df<-result_macrophagesBYo_full_DEG_Simple
df$gene<-rownames(result_macrophagesBYo_full_DEG_Simple) #create column for genes
mutateddf <- mutate(df, sig=ifelse(df$adj.P.Val<0.05, "adj.P.Val<0.05", "Not Sig"))
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
geom_point(aes(col=sig)) + #add points colored by significance
scale_color_manual(values=c("red", "black")) +
ggtitle("Volcanoplot DEG_Simple")+
geom_text_repel(data=head(mutateddf, 10), aes(label=gene))
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg")
volc
dev.off()
## png
## 2
#https://www.biostars.org/p/203876/
pdf(file="Step15_plot_Volcano_result_adjPValFC_DEG_Simple.pdf",height=10,width=10)
#par(mfrow=c(2,2))
df<-result_macrophagesBYo_full_DEG_Simple
df$gene<-rownames(result_macrophagesBYo_full_DEG_Simple) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
geom_point(aes(col=sig)) + #add points colored by significance
scale_color_manual(values=c("black", "red")) +
ggtitle("Volcanoplot DEG_Simple")+
geom_text_repel(data=head(mutateddf, 10), aes(label=gene))
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg")
volc
dev.off()
## png
## 2
df<-result_macrophagesBYo_full_DEG_Simple
df$gene<-rownames(result_macrophagesBYo_full_DEG_Simple) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#show UP100 or in bar plot
mutateddf_UP100 <-mutateddf[order(-mutateddf$logFC), ] #descenting order sort
mutateddf_UP100 <- mutateddf_UP100[1:100,] #top100 upregulated genes
#ggplot colors available http://sape.inf.usi.ch/quick-reference/ggplot2/colour
pdf(file="Step15_plot_barplot_result_adjPValUP100_DEG_Simple.pdf",height=5,width=10)
#Will have different colors depending on significance
bar1<-ggplot(mutateddf_UP100, aes(x=gene, y=logFC, width=.7))+
geom_bar(position="stack", fill="red4", stat="identity")+
labs(title=paste0("Barplot top genes with UP100 with significant adj.P.Val"))+
theme(axis.text.x=element_text(angle=90,hjust=1))
#labs(x="genes")+
#labs(y="fold change")+
#labs(caption="DEG_Simple")+
#scale_fill_gradient(low="black",high="red")+
#theme(plot.margin = margin(2,2,2,2, "cm"), plot.background = element_rect(fill = "white"))
print(bar1)
dev.off()
## png
## 2
edata_keepDEG_Simple_UP100<- edata[which(rownames(edata) %in% mutateddf_UP100$gene), ]
dim(mutateddf_UP100)
## [1] 100 10
dim(edata_keepDEG_Simple_UP100)
## [1] 100 30
pdf(file="Step15_plot_heatmap_result_adjPValUP100_DEG_Simple.pdf",height=15,width=15)
#mapdata<-edata[rownames(edata)==mutateddf$gene,]
mapdata<-edata_keepDEG_Simple_UP100
mapdata_matrix <- data.matrix(mapdata)
colfunc<-colorRampPalette(c("red","green")) #because we are blue group!
hr<-hclust(as.dist(1-cor(t(mapdata_matrix),method="pearson")),method="complete")
plotheatmap<-heatmap.2(mapdata_matrix,Rowv=as.dendrogram(hr),scale="row",density.info="none",trace="none",col=colfunc(256),cexRow=0.4,cexCol=0.4)
dev.off()
## png
## 2
#genes which are sig differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Simple)
## [1] 1036 8
DEG_Simplenames=rownames(result_allContrasts_FC5.00_DEG_Simple)
head(DEG_Simplenames)
## [1] "2010109P13Rik" "Gm13450" "Gm22004" "Gm22740"
## [5] "Gm23675" "Gm23780"
length(DEG_Simplenames)
## [1] 1036
#save result_allContrasts_full_DEG_Simple, result_allContrasts_FC5.00_DEG_Simple, DEG_Simplenames
save(result_allContrasts_full_DEG_Simple, result_allContrasts_FC5.00_DEG_Simple, DEG_Simplenames, file="DEG_Simple_data.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123 30
edata_keepDEG_Simple<- edata[which(rownames(edata) %in% DEG_Simplenames), ]
dim(edata_keepDEG_Simple)
## [1] 1036 30
#unique and common genes between the different contrasts DEG_Simple are retained
DT::datatable(edata_keepDEG_Simple[1:7,1:7])
#Export edata keepDEG_Simple genes only
write.csv(edata_keepDEG_Simple, "Step16_result_edata_keepDEG_Simple.csv")
edata_keepDEG_Simple filtered to keepDEG_Simple genes
#Summary Statistics edata_keepDEG_Simple
DT::datatable(summary(edata_keepDEG_Simple[1:7,1:7]))
write.csv(summary(edata_keepDEG_Simple),"Step16_result_summary_stats_edata_keepDEG_Simple.csv")
pdf(file="Step16_plot_Visualization_boxplot_MDS_PC_keepDEG_Simple_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))
############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
############boxplot chunk###############
#Before boxplot edata_keepDEG_Simple
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_Simple, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_Simple", col="yellow")
############MDSplot chunk###############
#Before MDSplot edata_keepDEG_Simple
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_Simple, col=condition_colors, legend= "all", main="edata_keepDEG_Simple", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
#######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_Simple, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_Simple), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -72.43369 19.89243 -10.98101397
## astro_SRR1033784 -70.55181 18.43565 -9.15791909
## endo_SRR1033795 -20.22385 -58.39399 -0.09212854
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_Simple",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$CellType,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_Simple, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_Simple), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -72.43369 19.89243 -10.98101397
## astro_SRR1033784 -70.55181 18.43565 -9.15791909
## endo_SRR1033795 -20.22385 -58.39399 -0.09212854
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_Simple",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Species,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_Simple, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_Simple), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -72.43369 19.89243 -10.98101397
## astro_SRR1033784 -70.55181 18.43565 -9.15791909
## endo_SRR1033795 -20.22385 -58.39399 -0.09212854
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_Simple",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Age,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
dev.off()
## png
## 2
t_edata_keepDEG_Simple<-t(edata_keepDEG_Simple)
DT::datatable(t_edata_keepDEG_Simple[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_Simple)
## [1] 30 1036
dim(pheno)
## [1] 30 5
#bind the pheno and edata_keepDEG_Simple, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_Simple<-cbind(pheno, t_edata_keepDEG_Simple)
dim(pheno_t_edata_keepDEG_Simple)
## [1] 30 1041
DT::datatable(pheno_t_edata_keepDEG_Simple[,1:10])
write.csv(pheno_t_edata_keepDEG_Simple, "Step16_result_pheno_t_edata_keepDEG_Simple_All_Group_aggregate.csv")
#select the CellType pheno drop others
pheno_t_edata_keepDEG_Simple_CellType=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Simple_CellType)
## [1] 30 1037
DT::datatable(pheno_t_edata_keepDEG_Simple_CellType[,1:10])
#aggregate edata_keepDEG_Simple or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_CellType)
pheno_t_edata_keepDEG_Simple_CellType_avg=pheno_t_edata_keepDEG_Simple_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_Simple_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_CellType_avg=t(pheno_t_edata_keepDEG_Simple_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_CellType_avg)=pheno_t_edata_keepDEG_Simple_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_Simple_CellType_avg)
## [1] 1036 2
DT::datatable(t_pheno_t_edata_keepDEG_Simple_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_CellType_avg, "Step16_result_edata_keepDEG_Simple_CellType_aggregate.csv")
#select the Species pheno drop others
pheno_t_edata_keepDEG_Simple_Species=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_Simple_Species)
## [1] 30 1037
DT::datatable(pheno_t_edata_keepDEG_Simple_Species[,1:10])
#aggregate edata_keepDEG_Simple or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_Species)
pheno_t_edata_keepDEG_Simple_Species_avg=pheno_t_edata_keepDEG_Simple_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_Simple_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_Species_avg=t(pheno_t_edata_keepDEG_Simple_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_Species_avg)=pheno_t_edata_keepDEG_Simple_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_Simple_Species_avg)
## [1] 1036 2
DT::datatable(t_pheno_t_edata_keepDEG_Simple_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_Species_avg, "Step16_result_edata_keepDEG_Simple_Species_aggregate.csv")
#select the Age pheno drop others
pheno_t_edata_keepDEG_Simple_Age=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Simple_Age)
## [1] 30 1037
DT::datatable(pheno_t_edata_keepDEG_Simple_Age[,1:10])
#aggregate edata_keepDEG_Simple or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_Age)
pheno_t_edata_keepDEG_Simple_Age_avg=pheno_t_edata_keepDEG_Simple_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_Simple_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_Age_avg=t(pheno_t_edata_keepDEG_Simple_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_Age_avg)=pheno_t_edata_keepDEG_Simple_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_Simple_Age_avg)
## [1] 1036 2
DT::datatable(t_pheno_t_edata_keepDEG_Simple_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_Age_avg, "Step16_result_edata_keepDEG_Simple_Age_aggregate.csv")
#select the Tissue pheno drop others
pheno_t_edata_keepDEG_Simple_Tissue=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_Simple_Tissue)
## [1] 30 1037
DT::datatable(pheno_t_edata_keepDEG_Simple_Tissue[,1:10])
#aggregate edata_keepDEG_Simple or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_Tissue)
pheno_t_edata_keepDEG_Simple_Tissue_avg=pheno_t_edata_keepDEG_Simple_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_Simple_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_Tissue_avg=t(pheno_t_edata_keepDEG_Simple_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_Tissue_avg)=pheno_t_edata_keepDEG_Simple_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_Simple_Tissue_avg)
## [1] 1036 4
DT::datatable(t_pheno_t_edata_keepDEG_Simple_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_Tissue_avg, "Step16_result_edata_keepDEG_Simple_Tissue_aggregate.csv")
t_pheno_t_edata_keepDEG_Simple_all_avg<-cbind(t_pheno_t_edata_keepDEG_Simple_CellType_avg, t_pheno_t_edata_keepDEG_Simple_Species_avg, t_pheno_t_edata_keepDEG_Simple_Age_avg, t_pheno_t_edata_keepDEG_Simple_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_Simple_all_avg))
write.csv(t_pheno_t_edata_keepDEG_Simple_all_avg, "Step16_result_edata_keepDEG_Simple_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_Simple, t_pheno_t_edata_keepDEG_Simple_CellType_avg, t_pheno_t_edata_keepDEG_Simple_Species_avg, t_pheno_t_edata_keepDEG_Simple_Age_avg, t_pheno_t_edata_keepDEG_Simple_Tissue_avg, t_pheno_t_edata_keepDEG_Simple_all_avg
save(pheno_t_edata_keepDEG_Simple, t_pheno_t_edata_keepDEG_Simple_CellType_avg, t_pheno_t_edata_keepDEG_Simple_Species_avg, t_pheno_t_edata_keepDEG_Simple_Age_avg, t_pheno_t_edata_keepDEG_Simple_Tissue_avg, t_pheno_t_edata_keepDEG_Simple_all_avg, file="edata_keepDEG_Simple_aggregate_data.RData")
pdf(file="Step16_plot_boxplot_corr_plot_edata_keepDEG_Simple_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))
#########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_Simple_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_CellType_avg", col="yellow")
#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_Simple_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_Species_avg", col="yellow")
#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_Simple_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_Age_avg", col="yellow")
#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_Simple_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_Tissue_avg", col="yellow")
#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_Simple_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_all_avg", col="yellow")
#########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_CellType_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_Species_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_Age_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_Tissue_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_all_Group_corr_pval_aggregate.csv")
dev.off()
## png
## 2
pdf(file="Step16_plot_density_edata_keepDEG_Simple_aka_expr_aggregate.pdf",height=10,width=10)
#########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_Simple_CellType_avg", y = "Density")
#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_Simple_Species_avg", y = "Density")
#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_Simple_Age_avg", y = "Density")
#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_Simple_Tissue_avg", y = "Density")
#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_Simple_all_avg", y = "Density")
dev.off()
## png
## 2
#save pheno, edata
save(pheno, metadata_1, edata, edata_keepDEG_Simple, file="edata_keepDEG_Simple_data.RData")
#save image
save.image(file="DEG_Simple_temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3997945 213.6 8074265 431.3 8074265 431.3
## Vcells 7515269 57.4 29568012 225.6 30716424 234.4
#Load pheno, edata and edata_combatY required for steps below
load(file="done_pre_Step8.RData")
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4197803 224.2 8074265 431.3 8074265 431.3
## Vcells 16901517 129.0 29568012 225.6 30716424 234.4
#load required libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggrepel) #Avoid overlapping labels
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
library(dplyr)
library(edgeR) # for RNAseq DE testing
#From here identify column numbers of interest for next step
colnames(pheno)
## [1] "CellType" "Age" "Study" "Species" "Tissue"
#edgeR DGElist
#Note here we use the generic DEG_edgeR differential expression calculation step
#mod only includes the variables of interest and covariants, not surrogate variables
#https://bioconductor.org/packages/release/bioc/manuals/edgeR/man/edgeR.pdf
#https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
#Creating a DGEList object: a container for the counts, metadata - these include, for example, sample names, gene names and normalisation factors once these are computed. The DGEList is an example of the custom task-specific structures that are frequently used in Bioconductor to make analyses easier.
#counts must be positive finite values so we take antilog of the edata which is log2 space
counts=(2^edata-1)#because we had log2+1
length(which(apply(counts, 1, function(row) any(row <0)))) #number of negative values
## [1] 0
#option1 to replace negative values with column mean
counts[counts<0] <-NA #replace negative values with NA
if_na(counts)=t(mean_col(counts)) #replace NA with mean of column or sample
#option2 to remove negative value containing rows
#counts[counts<0] <-NA #replace negative values with NA
#counts<-counts[!is.na(counts)]
dgList <- DGEList(counts=counts, genes=rownames(edata))
#dgList <- DGEList(counts=b, genes=rownames(b))
dim(dgList)#lib.size is calculated by default
## [1] 22123 30
dgList[1:3,1:3]
## An object of class "DGEList"
## $counts
## astro_SRR1033783 astro_SRR1033784 endo_SRR1033795
## 0610009B22Rik 30.037654 27.756307 20.2454671
## 0610009E02Rik 1.750835 1.163168 0.2837639
## 0610009L18Rik 17.582018 11.772189 2.0467289
##
## $samples
## group lib.size norm.factors
## astro_SRR1033783 1 999373.1 1
## astro_SRR1033784 1 999271.4 1
## endo_SRR1033795 1 999719.2 1
##
## $genes
## genes
## 0610009B22Rik 0610009B22Rik
## 0610009E02Rik 0610009E02Rik
## 0610009L18Rik 0610009L18Rik
#edgeR: Normalization calcNormFactors
#Normalization: important to normalise RNA-seq both within and between samples. edgeR has trimmed mean of M-values (TMM) method, and other options also available. TMM normalization lessens the impact of reads falling into a small set of genes
#since our edata is already loOtpm normalized we skip thi step, if using raw counts like htseq2Counts then use this
#dgList <- calcNormFactors(dgList, method="TMM")
#dgList[1:3,1:3] #new lib.size re-calculated is now added here
#lib.size norm factor <1 indicates small number of high counts are monopolizing seq data causing other genes to be lower, so library size is scaled down. Conversely, for >1 library size is scaled up analogous to downscaling the counts.
#edgeR: create model matrix
#Contrasts can be applied only to factors with 2 or more levels.
mod_DEG_edgeR = model.matrix(~0+CellType+Study, data=pheno)
#coefficients available for edgeR differential gene expression
DT::datatable(mod_DEG_edgeR)
#Coefficients if not estimable i.e. gives NAs in coefficient then remove. Try to use the max number of variables
#mod_DEG_edgeR = model.matrix(~0+CellType+Study, data=pheno) #Tissue and Study is same so removed Tissue
#mod_DEG_edgeR = model.matrix(~0+CellType+Study+Species+Tissue+Age, data=pheno)
#mod_DEG_edgeR = model.matrix(~0+CellType+Species+Age, data=pheno)
#edgeR: estimate dispersion use model matrix
dgList <- estimateGLMCommonDisp(dgList, design=mod_DEG_edgeR)
dgList <- estimateGLMTrendedDisp(dgList, design=mod_DEG_edgeR)
#Note that either the common or trended dispersion needs to be estimated before we can estimate the tagwise dispersion.
dgList <- estimateGLMTagwiseDisp(dgList, design=mod_DEG_edgeR)
#edgeR model fitting
fit_DEG_edgeR <- glmFit(dgList, mod_DEG_edgeR)
summary(fit_DEG_edgeR)
## Length Class Mode
## coefficients 88492 -none- numeric
## fitted.values 663690 -none- numeric
## deviance 22123 -none- numeric
## method 1 -none- character
## counts 663690 -none- numeric
## unshrunk.coefficients 88492 -none- numeric
## df.residual 22123 -none- numeric
## design 120 -none- numeric
## offset 663690 CompressedMatrix numeric
## dispersion 22123 -none- numeric
## prior.count 1 -none- numeric
## samples 3 data.frame list
## genes 1 data.frame list
## prior.df 1 -none- numeric
## AveLogCPM 22123 -none- numeric
DT::datatable(fit_DEG_edgeR$coefficients)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#Use the values in the creating_CellType_contrasts as the first three values of contrast matrix for CellType
design_DEG_edgeR<-model.matrix(~0+CellType, data=pheno)
colnames(design_DEG_edgeR)
## [1] "CellTypemacrophages" "CellTypeother"
design_DEG_edgeR[1:3,]
## CellTypemacrophages CellTypeother
## astro_SRR1033783 0 1
## astro_SRR1033784 0 1
## endo_SRR1033795 0 1
DT::datatable(design_DEG_edgeR)
#comparisons you want write here
creating_CellType_contrasts_DEG_edgeR<-makeContrasts(macrophagesBYo=CellTypemacrophages-CellTypeother, levels =design_DEG_edgeR)
creating_CellType_contrasts_DEG_edgeR
## Contrasts
## Levels macrophagesBYo
## CellTypemacrophages 1
## CellTypeother -1
#Create contrasts to find the differentially expressed genes .
#We hold other variables and surrogate variables at 0.
#Here the net number of rows is same and correspond to columns in fit_DEG_edgeR$coefficients
contrast.matrix_DEG_edgeR <- cbind("macrophagesBYo"=c(1,-1,rep(0,2)))
contrast.matrix_DEG_edgeR
## macrophagesBYo
## [1,] 1
## [2,] -1
## [3,] 0
## [4,] 0
#Identification of differentially expressed genes for the contrasts
#https://rstudio-pubs-static.s3.amazonaws.com/79395_b07ae39ce8124a5c873bd46d6075c137.html
#https://support.bioconductor.org/p/65751/
#https://support.bioconductor.org/p/47643/
lrt_macrophagesBYo_DEG_edgeR <- glmLRT(fit_DEG_edgeR, contrast=contrast.matrix_DEG_edgeR[,"macrophagesBYo"])
#DEG_edgeR for contrast "macrophagesBYo"=CellTypemacrophages-CellTypeother
result_macrophagesBYo_full_DEG_edgeR<- topTags(lrt_macrophagesBYo_DEG_edgeR, n = nrow(edata), adjust.method="BH")$table
names(result_macrophagesBYo_full_DEG_edgeR)[names(result_macrophagesBYo_full_DEG_edgeR) == 'FDR'] <- 'adj.P.Val'
result_macrophagesBYo_full_DEG_edgeR$Contrast<-rep("macrophagesBYo",nrow(result_macrophagesBYo_full_DEG_edgeR))
result_macrophagesBYo_full_DEG_edgeR$FC<- 2^(result_macrophagesBYo_full_DEG_edgeR$logFC)
#drop extra genes column
result_macrophagesBYo_full_DEG_edgeR <- result_macrophagesBYo_full_DEG_edgeR[!names(result_macrophagesBYo_full_DEG_edgeR) %in% c("genes")]
result_macrophagesBYo_FC5.00_DEG_edgeR<- result_macrophagesBYo_full_DEG_edgeR[which(result_macrophagesBYo_full_DEG_edgeR$FC >= 5.00), ]
dim(result_macrophagesBYo_full_DEG_edgeR)
## [1] 22123 7
dim(result_macrophagesBYo_FC5.00_DEG_edgeR)
## [1] 1412 7
write.csv(result_macrophagesBYo_full_DEG_edgeR, "Step17_result_macrophagesBYo_full_DEG_edgeR.csv")
write.csv(result_macrophagesBYo_FC5.00_DEG_edgeR, "Step17_result_macrophagesBYo_FC5.00_DEG_edgeR.csv")
summary(result_macrophagesBYo_full_DEG_edgeR)
## logFC logCPM LR PValue
## Min. :-14.6939 Min. : 1.052 Min. : 0.00000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 1.391 1st Qu.: 0.00009 1st Qu.:0.2796
## Median : 0.1188 Median : 2.443 Median : 0.17031 Median :0.6798
## Mean : 0.3804 Mean : 3.015 Mean : 1.33079 Mean :0.6082
## 3rd Qu.: 0.9655 3rd Qu.: 4.096 3rd Qu.: 1.16881 3rd Qu.:0.9925
## Max. : 14.4440 Max. :14.710 Max. :96.30633 Max. :1.0000
## adj.P.Val Contrast FC
## Min. :0.0000 Length:22123 Min. : 0.000
## 1st Qu.:1.0000 Class :character 1st Qu.: 1.000
## Median :1.0000 Mode :character Median : 1.086
## Mean :0.9034 Mean : 5.606
## 3rd Qu.:1.0000 3rd Qu.: 1.953
## Max. :1.0000 Max. :22288.571
summary(result_macrophagesBYo_FC5.00_DEG_edgeR)
## logFC logCPM LR PValue
## Min. : 2.323 Min. : 1.082 Min. : 0.6465 Min. :0.000000
## 1st Qu.: 2.707 1st Qu.: 1.789 1st Qu.: 2.6481 1st Qu.:0.003373
## Median : 3.257 Median : 2.987 Median : 4.7605 Median :0.029119
## Mean : 3.726 Mean : 3.455 Mean : 6.7557 Mean :0.065095
## 3rd Qu.: 4.302 3rd Qu.: 4.503 3rd Qu.: 8.5938 3rd Qu.:0.103672
## Max. :14.444 Max. :14.067 Max. :59.5245 Max. :0.421373
## adj.P.Val Contrast FC
## Min. :0.0000 Length:1412 Min. : 5.003
## 1st Qu.:0.1135 Class :character 1st Qu.: 6.530
## Median :0.3921 Mode :character Median : 9.560
## Mean :0.4399 Mean : 67.407
## 3rd Qu.:0.7541 3rd Qu.: 19.731
## Max. :1.0000 Max. :22288.571
#Bind all differentally expressed full genes into a single table
result_allContrasts_full_DEG_edgeR<-rbind(result_macrophagesBYo_full_DEG_edgeR)#here only one table so rbind not required
dim(result_allContrasts_full_DEG_edgeR)
## [1] 22123 7
write.csv(result_allContrasts_full_DEG_edgeR, "Step17_result_allContrasts_full_DEG_edgeR.csv")
#Bind all differentally expressed pval genes into a single table
result_allContrasts_FC5.00_DEG_edgeR<-rbind(result_macrophagesBYo_FC5.00_DEG_edgeR)#here only one table so rbind not required
dim(result_allContrasts_FC5.00_DEG_edgeR)
## [1] 1412 7
write.csv(result_allContrasts_FC5.00_DEG_edgeR, "Step17_result_allContrasts_FC5.00_DEG_edgeR.csv")
#https://www.biostars.org/p/203876/
pdf(file="Step17_plot_Volcano_result_adjFC5.00_DEG_edgeR.pdf",height=10,width=10)
#par(mfrow=c(2,2))
df<-result_macrophagesBYo_full_DEG_edgeR
df$gene<-rownames(result_macrophagesBYo_full_DEG_edgeR) #create column for genes
mutateddf <- mutate(df, sig=ifelse(df$adj.P.Val<0.05, "adj.P.Val<0.05", "Not Sig"))
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
geom_point(aes(col=sig)) + #add points colored by significance
scale_color_manual(values=c("red", "black")) +
ggtitle("Volcanoplot DEG_edgeR")+
geom_text_repel(data=head(mutateddf, 10), aes(label=gene))
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg")
volc
dev.off()
## png
## 2
#https://www.biostars.org/p/203876/
pdf(file="Step17_plot_Volcano_result_adjPValFC_DEG_edgeR.pdf",height=10,width=10)
#par(mfrow=c(2,2))
df<-result_macrophagesBYo_full_DEG_edgeR
df$gene<-rownames(result_macrophagesBYo_full_DEG_edgeR) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
geom_point(aes(col=sig)) + #add points colored by significance
scale_color_manual(values=c("black", "red")) +
ggtitle("Volcanoplot DEG_edgeR")+
geom_text_repel(data=head(mutateddf, 10), aes(label=gene))
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg")
volc
dev.off()
## png
## 2
df<-result_macrophagesBYo_full_DEG_edgeR
df$gene<-rownames(result_macrophagesBYo_full_DEG_edgeR) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#show UP100 or in bar plot
mutateddf_UP100 <-mutateddf[order(-mutateddf$logFC), ] #descenting order sort
mutateddf_UP100 <- mutateddf_UP100[1:100,] #top100 upregulated genes
#ggplot colors available http://sape.inf.usi.ch/quick-reference/ggplot2/colour
pdf(file="Step17_plot_barplot_result_adjPValUP100_DEG_edgeR.pdf",height=5,width=10)
#Will have different colors depending on significance
bar1<-ggplot(mutateddf_UP100, aes(x=gene, y=logFC, width=.7))+
geom_bar(position="stack", fill="red4", stat="identity")+
labs(title=paste0("Barplot top genes with UP100 with significant adj.P.Val"))+
theme(axis.text.x=element_text(angle=90,hjust=1))
#labs(x="genes")+
#labs(y="fold change")+
#labs(caption="DEG_edgeR")+
#scale_fill_gradient(low="black",high="red")+
#theme(plot.margin = margin(2,2,2,2, "cm"), plot.background = element_rect(fill = "white"))
print(bar1)
dev.off()
## png
## 2
edata_keepDEG_edgeR_UP100<- edata[which(rownames(edata) %in% mutateddf_UP100$gene), ]
dim(mutateddf_UP100)
## [1] 100 9
dim(edata_keepDEG_edgeR_UP100)
## [1] 100 30
pdf(file="Step17_plot_heatmap_result_adjPValUP100_DEG_edgeR.pdf",height=15,width=15)
#mapdata<-edata[rownames(edata)==mutateddf$gene,]
mapdata<-edata_keepDEG_edgeR_UP100
mapdata_matrix <- data.matrix(mapdata)
colfunc<-colorRampPalette(c("red","green")) #because we are blue group!
hr<-hclust(as.dist(1-cor(t(mapdata_matrix),method="pearson")),method="complete")
plotheatmap<-heatmap.2(mapdata_matrix,Rowv=as.dendrogram(hr),scale="row",density.info="none",trace="none",col=colfunc(256),cexRow=0.4,cexCol=0.4)
dev.off()
## png
## 2
#genes which are sig differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_edgeR)
## [1] 1412 7
DEG_edgeRnames=rownames(result_allContrasts_FC5.00_DEG_edgeR)
head(DEG_edgeRnames)
## [1] "Cd5l" "Alox15" "Saa3" "Serpinb2" "Arg1" "Gm26863"
length(DEG_edgeRnames)
## [1] 1412
#save result_allContrasts_full_DEG_edgeR, result_allContrasts_FC5.00_DEG_edgeR, DEG_edgeRnames
save(result_allContrasts_full_DEG_edgeR, result_allContrasts_FC5.00_DEG_edgeR, DEG_edgeRnames, file="DEG_edgeR_data.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123 30
edata_keepDEG_edgeR<- edata[which(rownames(edata) %in% DEG_edgeRnames), ]
dim(edata_keepDEG_edgeR)
## [1] 1412 30
#unique and common genes between the different contrasts DEG_edgeR are retained
DT::datatable(edata_keepDEG_edgeR[1:7,1:7])
#Export edata keepDEG_edgeR genes only
write.csv(edata_keepDEG_edgeR, "Step18_result_edata_keepDEG_edgeR.csv")
edata_keepDEG_edgeR filtered to keepDEG_edgeR genes
#Summary Statistics edata_keepDEG_edgeR
DT::datatable(summary(edata_keepDEG_edgeR[1:7,1:7]))
write.csv(summary(edata_keepDEG_edgeR),"Step18_result_summary_stats_edata_keepDEG_edgeR.csv")
pdf(file="Step18_plot_Visualization_boxplot_MDS_PC_keepDEG_edgeR_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))
############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
############boxplot chunk###############
#Before boxplot edata_keepDEG_edgeR
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_edgeR, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_edgeR", col="yellow")
############MDSplot chunk###############
#Before MDSplot edata_keepDEG_edgeR
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_edgeR, col=condition_colors, legend= "all", main="edata_keepDEG_edgeR", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
#######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_edgeR, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_edgeR), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -11.405299 -27.182588 4.240269
## astro_SRR1033784 -9.222653 -30.432574 5.262432
## endo_SRR1033795 -12.643325 -2.270665 2.066303
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_edgeR",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$CellType,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_edgeR, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_edgeR), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -11.405299 -27.182588 4.240269
## astro_SRR1033784 -9.222653 -30.432574 5.262432
## endo_SRR1033795 -12.643325 -2.270665 2.066303
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_edgeR",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Species,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_edgeR, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_edgeR), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -11.405299 -27.182588 4.240269
## astro_SRR1033784 -9.222653 -30.432574 5.262432
## endo_SRR1033795 -12.643325 -2.270665 2.066303
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_edgeR",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Age,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
dev.off()
## png
## 2
t_edata_keepDEG_edgeR<-t(edata_keepDEG_edgeR)
DT::datatable(t_edata_keepDEG_edgeR[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_edgeR)
## [1] 30 1412
dim(pheno)
## [1] 30 5
#bind the pheno and edata_keepDEG_edgeR, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_edgeR<-cbind(pheno, t_edata_keepDEG_edgeR)
dim(pheno_t_edata_keepDEG_edgeR)
## [1] 30 1417
DT::datatable(pheno_t_edata_keepDEG_edgeR[,1:10])
write.csv(pheno_t_edata_keepDEG_edgeR, "Step18_result_pheno_t_edata_keepDEG_edgeR_All_Group_aggregate.csv")
#select the CellType pheno drop others
pheno_t_edata_keepDEG_edgeR_CellType=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_edgeR_CellType)
## [1] 30 1413
DT::datatable(pheno_t_edata_keepDEG_edgeR_CellType[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_CellType)
pheno_t_edata_keepDEG_edgeR_CellType_avg=pheno_t_edata_keepDEG_edgeR_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_CellType_avg=t(pheno_t_edata_keepDEG_edgeR_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_CellType_avg)=pheno_t_edata_keepDEG_edgeR_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_edgeR_CellType_avg)
## [1] 1412 2
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_CellType_avg, "Step18_result_edata_keepDEG_edgeR_CellType_aggregate.csv")
#select the Species pheno drop others
pheno_t_edata_keepDEG_edgeR_Species=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_edgeR_Species)
## [1] 30 1413
DT::datatable(pheno_t_edata_keepDEG_edgeR_Species[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_Species)
pheno_t_edata_keepDEG_edgeR_Species_avg=pheno_t_edata_keepDEG_edgeR_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_Species_avg=t(pheno_t_edata_keepDEG_edgeR_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_Species_avg)=pheno_t_edata_keepDEG_edgeR_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_edgeR_Species_avg)
## [1] 1412 2
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_Species_avg, "Step18_result_edata_keepDEG_edgeR_Species_aggregate.csv")
#select the Age pheno drop others
pheno_t_edata_keepDEG_edgeR_Age=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_edgeR_Age)
## [1] 30 1413
DT::datatable(pheno_t_edata_keepDEG_edgeR_Age[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_Age)
pheno_t_edata_keepDEG_edgeR_Age_avg=pheno_t_edata_keepDEG_edgeR_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_Age_avg=t(pheno_t_edata_keepDEG_edgeR_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_Age_avg)=pheno_t_edata_keepDEG_edgeR_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_edgeR_Age_avg)
## [1] 1412 2
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_Age_avg, "Step18_result_edata_keepDEG_edgeR_Age_aggregate.csv")
#select the Tissue pheno drop others
pheno_t_edata_keepDEG_edgeR_Tissue=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_edgeR_Tissue)
## [1] 30 1413
DT::datatable(pheno_t_edata_keepDEG_edgeR_Tissue[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_Tissue)
pheno_t_edata_keepDEG_edgeR_Tissue_avg=pheno_t_edata_keepDEG_edgeR_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_Tissue_avg=t(pheno_t_edata_keepDEG_edgeR_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)=pheno_t_edata_keepDEG_edgeR_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)
## [1] 1412 4
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, "Step18_result_edata_keepDEG_edgeR_Tissue_aggregate.csv")
t_pheno_t_edata_keepDEG_edgeR_all_avg<-cbind(t_pheno_t_edata_keepDEG_edgeR_CellType_avg, t_pheno_t_edata_keepDEG_edgeR_Species_avg, t_pheno_t_edata_keepDEG_edgeR_Age_avg, t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_edgeR_all_avg))
write.csv(t_pheno_t_edata_keepDEG_edgeR_all_avg, "Step18_result_edata_keepDEG_edgeR_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_edgeR, t_pheno_t_edata_keepDEG_edgeR_CellType_avg, t_pheno_t_edata_keepDEG_edgeR_Species_avg, t_pheno_t_edata_keepDEG_edgeR_Age_avg, t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, t_pheno_t_edata_keepDEG_edgeR_all_avg
save(pheno_t_edata_keepDEG_edgeR, t_pheno_t_edata_keepDEG_edgeR_CellType_avg, t_pheno_t_edata_keepDEG_edgeR_Species_avg, t_pheno_t_edata_keepDEG_edgeR_Age_avg, t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, t_pheno_t_edata_keepDEG_edgeR_all_avg, file="edata_keepDEG_edgeR_aggregate_data.RData")
pdf(file="Step18_plot_boxplot_corr_plot_edata_keepDEG_edgeR_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))
#########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_edgeR_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_CellType_avg", col="yellow")
#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_edgeR_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_Species_avg", col="yellow")
#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_edgeR_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_Age_avg", col="yellow")
#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_Tissue_avg", col="yellow")
#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_edgeR_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_all_avg", col="yellow")
#########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_CellType_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_Species_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_Age_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_Tissue_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_all_Group_corr_pval_aggregate.csv")
dev.off()
## png
## 2
pdf(file="Step18_plot_density_edata_keepDEG_edgeR_aka_expr_aggregate.pdf",height=10,width=10)
#########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_edgeR_CellType_avg", y = "Density")
#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_edgeR_Species_avg", y = "Density")
#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_edgeR_Age_avg", y = "Density")
#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_edgeR_Tissue_avg", y = "Density")
#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_edgeR_all_avg", y = "Density")
dev.off()
## png
## 2
#save pheno, edata
save(pheno, metadata_1, edata, edata_keepDEG_edgeR, file="edata_keepDEG_edgeR_data.RData")
#save image
save.image(file="DEG_edgeR_temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4048954 216.3 8074265 431.3 8074265 431.3
## Vcells 7853226 60.0 28449291 217.1 35561613 271.4
#Load pheno, edata required for steps below
load(file="done_pre_Step8.RData")
#Load DEG_edgeR DEG_Limma and DEG_Simple results required for steps below
load(file="DEG_edgeR_data.RData")
load(file="DEG_Limma_data.RData")
load(file="DEG_Simple_data.RData")
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4248901 227.0 8074265 431.3 8074265 431.3
## Vcells 17850741 136.2 28449291 217.1 35561613 271.4
#load required libraries
#https://www.bioconductor.org/packages/devel/bioc/vignettes/GeneOverlap/inst/doc/GeneOverlap.pdf
library(GeneOverlap)
library(VennDiagram)
## Warning: package 'VennDiagram' was built under R version 3.5.3
## Loading required package: grid
## Loading required package: futile.logger
## Warning: package 'futile.logger' was built under R version 3.5.3
##
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
##
## scat
#Create a list object of all the DEG genes
DEG_list<-list(DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames)
names(DEG_list)<-c("DEG_edgeRnames","DEG_Limmanames", "DEG_Simplenames")
summary(DEG_list)
## Length Class Mode
## DEG_edgeRnames 1412 -none- character
## DEG_Limmanames 545 -none- character
## DEG_Simplenames 1036 -none- character
#calculate total number of unique genes in the DEG
size_estimate1=c(DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames)
size_estimate2=unique(size_estimate1)
size_estimate3=length(size_estimate2)
#Calculate gene overlap matrix between the DEG genes from DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames
gom.self <- newGOM(DEG_list, DEG_list, genome.size=size_estimate3)
#export p-value for overlap between DEG genes from DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames
geom.mat<-getMatrix(gom.self, name="pval")
geom.mat
## DEG_edgeRnames DEG_Limmanames DEG_Simplenames
## DEG_edgeRnames 0.000000e+00 9.618864e-146 1
## DEG_Limmanames 9.618864e-146 0.000000e+00 1
## DEG_Simplenames 1.000000e+00 1.000000e+00 0
write.csv(geom.mat, "Step21_result_p-value_overlap_DEG_all.csv")
#Here the genes are counted for each list of genes and overlap is shown
gom.self["DEG_edgeRnames", "DEG_Limmanames"]
## GeneOverlap object:
## listA size=1412
## listB size=545
## Intersection size=544
## Overlapping p-value=9.6e-146
## Jaccard Index=0.4
gom.self["DEG_edgeRnames", "DEG_Simplenames"]
## GeneOverlap object:
## listA size=1412
## listB size=1036
## Intersection size=63
## Overlapping p-value=1
## Jaccard Index=0.0
gom.self["DEG_Limmanames", "DEG_Simplenames"]
## GeneOverlap object:
## listA size=545
## listB size=1036
## Intersection size=15
## Overlapping p-value=1
## Jaccard Index=0.0
pdf(file="Step21_plot_Visualization_heatmap_overlap_DEG_all.pdf",height=10,width=10)
#par(mfrow=c(2,1))
drawHeatmap(gom.self, what = "Jaccard")
#"odds.ratio" "Jaccard"
dev.off()
## png
## 2
pdf(file="Step21_plot_Visualization_vennDiagram_overlap_DEG_all.pdf",height=10,width=10)
#par(mfrow=c(2,1))
grid.draw(venn.diagram(list(DEG_edgeRnames = DEG_edgeRnames, DEG_Limmanames = DEG_Limmanames,
DEG_Simplenames = DEG_Simplenames), filename = NULL, fill = c("blue", "orange",
"green")))
dev.off()
## png
## 2
#save names of genes overlaped for DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames in atleast 2
DEG_AllDEGmethodsnames_3grs=intersect(intersect(DEG_edgeRnames, DEG_Limmanames), DEG_Simplenames)
DEG_AllDEGmethodsnames_2grs_1=intersect(DEG_edgeRnames, DEG_Limmanames)
DEG_AllDEGmethodsnames_2grs_2=intersect(DEG_edgeRnames, DEG_Simplenames)
DEG_AllDEGmethodsnames_2grs_3=intersect(DEG_Limmanames, DEG_Simplenames)
DEG_AllDEGmethodsnames=cbind(DEG_AllDEGmethodsnames_3grs,DEG_AllDEGmethodsnames_2grs_1,DEG_AllDEGmethodsnames_2grs_2,DEG_AllDEGmethodsnames_2grs_3)
## Warning in cbind(DEG_AllDEGmethodsnames_3grs,
## DEG_AllDEGmethodsnames_2grs_1, : number of rows of result is not a multiple
## of vector length (arg 1)
head(DEG_AllDEGmethodsnames)
## DEG_AllDEGmethodsnames_3grs DEG_AllDEGmethodsnames_2grs_1
## [1,] "Fabp7" "Cd5l"
## [2,] "Bhlhe40" "Alox15"
## [3,] "A430105I19Rik" "Saa3"
## [4,] "Fam20a" "Serpinb2"
## [5,] "Tsku" "Arg1"
## [6,] "Gm9889" "Gm26863"
## DEG_AllDEGmethodsnames_2grs_2 DEG_AllDEGmethodsnames_2grs_3
## [1,] "Fabp7" "Bhlhe40"
## [2,] "Bhlhe40" "Tsku"
## [3,] "A430105I19Rik" "Fabp7"
## [4,] "Fam20a" "A430105I19Rik"
## [5,] "Tsku" "Fam20a"
## [6,] "Gm9889" "Ccm2l"
length(DEG_AllDEGmethodsnames)
## [1] 2176
#Keep only those genes which are differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Limma)
## [1] 545 8
result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods<-result_allContrasts_FC5.00_DEG_Limma[which(rownames(result_allContrasts_FC5.00_DEG_Limma) %in% DEG_AllDEGmethodsnames), ]
dim(result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods)
## [1] 544 8
#unique and common genes between the different contrasts DEG_edgeRLimmaSimple are retained
DT::datatable(result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods[1:7,])
#Export edata keepDEG_edgeRLimmaSimple genes only
write.csv(result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods, "Step22_result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods.csv")
#Keep only those genes which are differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_edgeR)
## [1] 1412 7
result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods<-result_allContrasts_FC5.00_DEG_edgeR[which(rownames(result_allContrasts_FC5.00_DEG_edgeR) %in% DEG_AllDEGmethodsnames), ]
dim(result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods)
## [1] 592 7
#unique and common genes between the different contrasts DEG_edgeRedgeRSimple are retained
DT::datatable(result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods[1:7,])
#Export edata keepDEG_edgeRedgeRSimple genes only
write.csv(result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods, "Step22_result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods.csv")
#Keep only those genes which are differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Simple)
## [1] 1036 8
result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods<-result_allContrasts_FC5.00_DEG_Simple[which(rownames(result_allContrasts_FC5.00_DEG_Simple) %in% DEG_AllDEGmethodsnames), ]
dim(result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods)
## [1] 63 8
#unique and common genes between the different contrasts DEG_SimpleSimpleSimple are retained
DT::datatable(result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods[1:7,])
#Export edata keepDEG_SimpleSimpleSimple genes only
write.csv(result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods, "Step22_result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods.csv")
#save DEG_AllDEGmethodsnames
save(DEG_AllDEGmethodsnames, file="DEG_AllDEGmethodsnames.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123 30
edata_keepDEG_AllDEGmethods<- edata[which(rownames(edata) %in% DEG_AllDEGmethodsnames), ]
dim(edata_keepDEG_AllDEGmethods)
## [1] 592 30
#unique and common genes between the different contrasts DEG_AllDEGmethods are retained
DT::datatable(edata_keepDEG_AllDEGmethods[1:7,1:7])
#Export edata keepDEG_AllDEGmethods genes only
write.csv(edata_keepDEG_AllDEGmethods, "Step22_result_edata_keepDEG_AllDEGmethods.csv")
edata_keepDEG_AllDEGmethods filtered to keepDEG_AllDEGmethods genes
#Summary Statistics edata_keepDEG_AllDEGmethods
DT::datatable(summary(edata_keepDEG_AllDEGmethods[1:7,1:7]))
write.csv(summary(edata_keepDEG_AllDEGmethods),"Step22_result_summary_stats_edata_keepDEG_AllDEGmethods.csv")
pdf(file="Step22_plot_Visualization_boxplot_MDS_PC_keepDEG_AllDEGmethods_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))
############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
############boxplot chunk###############
#Before boxplot edata_keepDEG_AllDEGmethods
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_AllDEGmethods, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_AllDEGmethods", col="yellow")
############MDSplot chunk###############
#Before MDSplot edata_keepDEG_AllDEGmethods
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_AllDEGmethods, col=condition_colors, legend= "all", main="edata_keepDEG_AllDEGmethods", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
#######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_AllDEGmethods, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_AllDEGmethods), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -7.598809 -15.076933 -10.365641
## astro_SRR1033784 -6.298660 -15.941767 -11.194205
## endo_SRR1033795 -8.991872 -4.865064 -2.437354
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_AllDEGmethods",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$CellType,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_AllDEGmethods, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_AllDEGmethods), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -7.598809 -15.076933 -10.365641
## astro_SRR1033784 -6.298660 -15.941767 -11.194205
## endo_SRR1033795 -8.991872 -4.865064 -2.437354
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_AllDEGmethods",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Species,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
#########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_AllDEGmethods, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_AllDEGmethods), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## astro_SRR1033783 -7.598809 -15.076933 -10.365641
## astro_SRR1033784 -6.298660 -15.941767 -11.194205
## endo_SRR1033795 -8.991872 -4.865064 -2.437354
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study edata_keepDEG_AllDEGmethods",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=metadata_1$Age,
col=as.numeric(metadata_1$Study),
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE52564_to_SRP035309"),
text.col=c("red", "blue", "green")
)
dev.off()
## png
## 2
t_edata_keepDEG_AllDEGmethods<-t(edata_keepDEG_AllDEGmethods)
DT::datatable(t_edata_keepDEG_AllDEGmethods[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_AllDEGmethods)
## [1] 30 592
dim(pheno)
## [1] 30 5
#bind the pheno and edata_keepDEG_AllDEGmethods, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_AllDEGmethods<-cbind(pheno, t_edata_keepDEG_AllDEGmethods)
dim(pheno_t_edata_keepDEG_AllDEGmethods)
## [1] 30 597
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods[,1:10])
write.csv(pheno_t_edata_keepDEG_AllDEGmethods, "Step22_result_pheno_t_edata_keepDEG_AllDEGmethods_All_Group_aggregate.csv")
#select the CellType pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_CellType=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_CellType)
## [1] 30 593
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_CellType[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_CellType)
pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg=pheno_t_edata_keepDEG_AllDEGmethods_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg)=pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg)
## [1] 592 2
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, "Step22_result_edata_keepDEG_AllDEGmethods_CellType_aggregate.csv")
#select the Species pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_Species=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_Species)
## [1] 30 593
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Species[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_Species)
pheno_t_edata_keepDEG_AllDEGmethods_Species_avg=pheno_t_edata_keepDEG_AllDEGmethods_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg)=pheno_t_edata_keepDEG_AllDEGmethods_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg)
## [1] 592 2
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, "Step22_result_edata_keepDEG_AllDEGmethods_Species_aggregate.csv")
#select the Age pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_Age=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_Age)
## [1] 30 593
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Age[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_Age)
pheno_t_edata_keepDEG_AllDEGmethods_Age_avg=pheno_t_edata_keepDEG_AllDEGmethods_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg)=pheno_t_edata_keepDEG_AllDEGmethods_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg)
## [1] 592 2
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, "Step22_result_edata_keepDEG_AllDEGmethods_Age_aggregate.csv")
#select the Tissue pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_Tissue=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_Tissue)
## [1] 30 593
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Tissue[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_Tissue)
pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg=pheno_t_edata_keepDEG_AllDEGmethods_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)=pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)
## [1] 592 4
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, "Step22_result_edata_keepDEG_AllDEGmethods_Tissue_aggregate.csv")
t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg<-cbind(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg))
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg, "Step22_result_edata_keepDEG_AllDEGmethods_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_AllDEGmethods, t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg
save(pheno_t_edata_keepDEG_AllDEGmethods, t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg, file="edata_keepDEG_AllDEGmethods_aggregate_data.RData")
pdf(file="Step22_plot_boxplot_corr_plot_edata_keepDEG_AllDEGmethods_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))
#########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg", col="yellow")
#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg", col="yellow")
#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg", col="yellow")
#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg", col="yellow")
#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg", col="yellow")
#########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_CellType_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_Species_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_Age_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_Tissue_Group_corr_pval_aggregate.csv")
#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_all_Group_corr_pval_aggregate.csv")
dev.off()
## png
## 2
pdf(file="Step22_plot_density_edata_keepDEG_AllDEGmethods_aka_expr_aggregate.pdf",height=10,width=10)
#########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg", y = "Density")
#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg", y = "Density")
#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg", y = "Density")
#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg", y = "Density")
#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) +
geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg", y = "Density")
dev.off()
## png
## 2
#save pheno, edata required for WGCNA
save(pheno, metadata_1, edata, edata_keepDEG_AllDEGmethods, file="edata_keepDEG_AllDEGmethods_data.RData")
#save image
save.image(file="DEG_AllDEGmethods_temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4115783 219.9 8074265 431.3 8074265 431.3
## Vcells 7968671 60.8 28449291 217.1 35561613 271.4
End of DEG pipeline, uses edata and modified metadata
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 16299)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] VennDiagram_1.6.20 futile.logger_1.4.3 GeneOverlap_1.16.0
## [4] edgeR_3.22.5 dplyr_0.8.0.1 ggrepel_0.8.1
## [7] data.table_1.12.2 expss_0.8.11 filesstrings_3.0.0
## [10] stringr_1.4.0 corrplot_0.84 viridis_0.5.1
## [13] viridisLite_0.3.0 Hmisc_4.2-0 Formula_1.2-3
## [16] lattice_0.20-38 RColorBrewer_1.1-2 gplots_3.0.1.1
## [19] ggmap_3.0.0 ggplot2_3.1.1 pamr_1.56.1
## [22] survival_2.44-1.1 cluster_2.0.8 sva_3.28.0
## [25] BiocParallel_1.14.2 genefilter_1.62.0 mgcv_1.8-28
## [28] nlme_3.1-137 limma_3.36.5
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rjson_0.2.20 htmlTable_1.13.1
## [4] base64enc_0.1-3 rstudioapi_0.10 DT_0.5
## [7] bit64_0.9-7 AnnotationDbi_1.42.1 splines_3.5.2
## [10] knitr_1.22 jsonlite_1.6 annotate_1.58.0
## [13] png_0.1-7 shiny_1.3.2 compiler_3.5.2
## [16] httr_1.4.0 backports_1.1.4 assertthat_0.2.1
## [19] Matrix_1.2-17 lazyeval_0.2.2 later_0.8.0
## [22] formatR_1.6 acepack_1.4.1 htmltools_0.3.6
## [25] tools_3.5.2 gtable_0.3.0 glue_1.3.1
## [28] reshape2_1.4.3 Rcpp_1.0.1 Biobase_2.40.0
## [31] gdata_2.18.0 crosstalk_1.0.0 xfun_0.6
## [34] mime_0.6 gtools_3.8.1 XML_3.98-1.19
## [37] scales_1.0.0 promises_1.0.1 parallel_3.5.2
## [40] lambda.r_1.2.3 yaml_2.2.0 strex_0.1.3
## [43] memoise_1.1.0 gridExtra_2.3 rpart_4.1-15
## [46] latticeExtra_0.6-28 stringi_1.4.3 RSQLite_2.1.1
## [49] S4Vectors_0.18.3 checkmate_1.9.1 caTools_1.17.1.2
## [52] BiocGenerics_0.26.0 RgoogleMaps_1.4.3 rlang_0.3.4
## [55] pkgconfig_2.0.2 matrixStats_0.54.0 bitops_1.0-6
## [58] evaluate_0.13 purrr_0.3.2 htmlwidgets_1.3
## [61] labeling_0.3 bit_1.1-14 tidyselect_0.2.5
## [64] plyr_1.8.4 magrittr_1.5 R6_2.4.0
## [67] IRanges_2.14.12 DBI_1.0.0 pillar_1.3.1
## [70] foreign_0.8-71 withr_2.1.2 RCurl_1.95-4.12
## [73] nnet_7.3-12 tibble_2.1.1 crayon_1.3.4
## [76] futile.options_1.0.1 KernSmooth_2.23-15 rmarkdown_1.12
## [79] jpeg_0.1-8 locfit_1.5-9.1 blob_1.1.1
## [82] digest_0.6.18 xtable_1.8-4 tidyr_0.8.3
## [85] httpuv_1.5.1 stats4_3.5.2 munsell_0.5.0
toLatex(sessionInfo())
## \begin{itemize}\raggedright
## \item R version 3.5.2 (2018-12-20), \verb|x86_64-w64-mingw32|
## \item Locale: \verb|LC_COLLATE=English_United States.1252|, \verb|LC_CTYPE=English_United States.1252|, \verb|LC_MONETARY=English_United States.1252|, \verb|LC_NUMERIC=C|, \verb|LC_TIME=English_United States.1252|
## \item Running under: \verb|Windows 10 x64 (build 16299)|
## \item Matrix products: default
## \item Base packages: base, datasets, graphics, grDevices, grid,
## methods, stats, utils
## \item Other packages: BiocParallel~1.14.2, cluster~2.0.8,
## corrplot~0.84, data.table~1.12.2, dplyr~0.8.0.1, edgeR~3.22.5,
## expss~0.8.11, filesstrings~3.0.0, Formula~1.2-3,
## futile.logger~1.4.3, genefilter~1.62.0, GeneOverlap~1.16.0,
## ggmap~3.0.0, ggplot2~3.1.1, ggrepel~0.8.1, gplots~3.0.1.1,
## Hmisc~4.2-0, lattice~0.20-38, limma~3.36.5, mgcv~1.8-28,
## nlme~3.1-137, pamr~1.56.1, RColorBrewer~1.1-2, stringr~1.4.0,
## survival~2.44-1.1, sva~3.28.0, VennDiagram~1.6.20,
## viridis~0.5.1, viridisLite~0.3.0
## \item Loaded via a namespace (and not attached): acepack~1.4.1,
## annotate~1.58.0, AnnotationDbi~1.42.1, assertthat~0.2.1,
## backports~1.1.4, base64enc~0.1-3, Biobase~2.40.0,
## BiocGenerics~0.26.0, bit~1.1-14, bit64~0.9-7, bitops~1.0-6,
## blob~1.1.1, caTools~1.17.1.2, checkmate~1.9.1,
## colorspace~1.4-1, compiler~3.5.2, crayon~1.3.4,
## crosstalk~1.0.0, DBI~1.0.0, digest~0.6.18, DT~0.5,
## evaluate~0.13, foreign~0.8-71, formatR~1.6,
## futile.options~1.0.1, gdata~2.18.0, glue~1.3.1, gridExtra~2.3,
## gtable~0.3.0, gtools~3.8.1, htmlTable~1.13.1, htmltools~0.3.6,
## htmlwidgets~1.3, httpuv~1.5.1, httr~1.4.0, IRanges~2.14.12,
## jpeg~0.1-8, jsonlite~1.6, KernSmooth~2.23-15, knitr~1.22,
## labeling~0.3, lambda.r~1.2.3, later~0.8.0,
## latticeExtra~0.6-28, lazyeval~0.2.2, locfit~1.5-9.1,
## magrittr~1.5, Matrix~1.2-17, matrixStats~0.54.0,
## memoise~1.1.0, mime~0.6, munsell~0.5.0, nnet~7.3-12,
## parallel~3.5.2, pillar~1.3.1, pkgconfig~2.0.2, plyr~1.8.4,
## png~0.1-7, promises~1.0.1, purrr~0.3.2, R6~2.4.0, Rcpp~1.0.1,
## RCurl~1.95-4.12, reshape2~1.4.3, RgoogleMaps~1.4.3,
## rjson~0.2.20, rlang~0.3.4, rmarkdown~1.12, rpart~4.1-15,
## RSQLite~2.1.1, rstudioapi~0.10, S4Vectors~0.18.3,
## scales~1.0.0, shiny~1.3.2, splines~3.5.2, stats4~3.5.2,
## strex~0.1.3, stringi~1.4.3, tibble~2.1.1, tidyr~0.8.3,
## tidyselect~0.2.5, tools~3.5.2, withr~2.1.2, xfun~0.6,
## XML~3.98-1.19, xtable~1.8-4, yaml~2.2.0
## \end{itemize}
#Organize of files
#library(filesstrings)
#raw, expr normalized and metadata
dir.create("merged_expr_metadata")
file.move(list.files(pattern = '*rawdata_transformation_FewSamples.pdf'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*rawdata_transformation.pdf'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*SampleID.csv'), "merged_expr_metadata")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*SampleID_coded.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'preSVA_Merging.RData'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'done_pre_Step8.RData'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'GPL*'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*data_non-log'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*data_originalfromGEO'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*Annotation_Expr_GeneMs.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*metadata.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*SampleID_coded.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'preSVA*'), "merged_expr_metadata")
## 0 files moved. 0 failed.
#Aggregate expression by variable and plots
dir.create("Aggregate_csv_plots")
file.move(list.files(pattern = '*aggregate.csv'), "Aggregate_csv_plots")
## 55 files moved. 0 failed.
file.move(list.files(pattern = '*aggregate.pdf'), "Aggregate_csv_plots")
## 10 files moved. 0 failed.
file.move(list.files(pattern = 'edata_aggregate_data.RData'), "Aggregate_csv_plots")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'edata_lmY_aggregate_data.RData'), "Aggregate_csv_plots")
## 0 files moved. 0 failed.
#edata_keepDEG results and plots
dir.create("edata_keepDEG")
file.move(list.files(pattern = '*edata.csv'), "edata_keepDEG")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'Step10_plot*'), "edata_keepDEG")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_Limma.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step14_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_Simple.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step16_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_edgeR.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step18_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_AllDEGmethods.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step22_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_pca_unscaled.csv'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'Step7_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'edata_keepDEG_AllDEGmethods_data'), "edata_keepDEG")
## 1 file moved. 0 failed.
#DEG_Limma results
dir.create("DEG_Limma_results")
file.move(list.files(pattern = '*FC5.00_DEG_Limma.csv'), "DEG_Limma_results")
## 3 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_Limma.csv'), "DEG_Limma_results")
## 3 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_Limma.pdf'), "DEG_Limma_results")
## 5 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Limma_data.RData'), "DEG_Limma_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*Limma_aggregate_data.RData'), "DEG_Limma_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_Limma_temp.RData'), "DEG_Limma_results")
## 1 file moved. 0 failed.
#DEG_Simple results
dir.create("DEG_Simple_results")
file.move(list.files(pattern = '*FC5.00_DEG_Simple.csv'), "DEG_Simple_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_Simple.csv'), "DEG_Simple_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_Simple.pdf'), "DEG_Simple_results")
## 4 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Simple_data.RData'), "DEG_Simple_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*Simple_aggregate_data.RData'), "DEG_Simple_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_Simple_temp.RData'), "DEG_Simple_results")
## 1 file moved. 0 failed.
#DEG_edgeR results
dir.create("DEG_edgeR_results")
file.move(list.files(pattern = '*FC5.00_DEG_edgeR.csv'), "DEG_edgeR_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_edgeR.csv'), "DEG_edgeR_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_edgeR.pdf'), "DEG_edgeR_results")
## 4 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_edgeR_data.RData'), "DEG_edgeR_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*edgeR_aggregate_data.RData'), "DEG_edgeR_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_edgeR_temp.RData'), "DEG_edgeR_results")
## 1 file moved. 0 failed.
#DEG_Cuff results
dir.create("DEG_Cuff_results")
file.move(list.files(pattern = '*FC5.00_DEG_Cuff.csv'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_Cuff.csv'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_Cuff.pdf'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Cuff_data.RData'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*Cuff_aggregate_data.RData'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Cuff_temp.RData'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
#DEG_AllDEGmethods results
dir.create("DEG_AllDEGmethods_results")
file.move(list.files(pattern = 'Step21*'), "DEG_AllDEGmethods_results")
## 6 files moved. 0 failed.
file.move(list.files(pattern = '*AllDEGmethods.csv'), "DEG_AllDEGmethods_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_AllDEGmethods_temp.RData'), "DEG_AllDEGmethods_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*AllDEGmethods_aggregate_data.RData'), "DEG_AllDEGmethods_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_AllDEGmethodsnames.RData'), "DEG_AllDEGmethods_results")
## 1 file moved. 0 failed.
#remove extra files
file.remove(list.files(pattern ='VennDiagram*'))
## [1] TRUE
file.remove(list.files(pattern ='temp.RData'))
## logical(0)
#Remove .RData and clear environment to free up memory
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4123589 220.3 8074265 431.3 8074265 431.3
## Vcells 7966044 60.8 28449291 217.1 35561613 271.4